--- title: "Graveyard" subtitle: "Where failed code chunks & programs that don't really fut what I need go to die" --- Line 213 QC stuff prefetch SRR390728 fastq-dump SRR390728 seqtk seq -a SRR390728.fastq > SRR390728.fasta install.packages("SRAdb") biocLite("SRAtoolkit") library(SRAdb) library(SRAtoolkit) sratoolkit::prefetch("SRR390728") sratoolkit::fastqDump("SRR390728") library(Biostrings) fastq.file <- file.path(".", "SRR390728.fastq") fasq.file <- file.path(".", "SRR390728.fasta") fastq <- readDNAStringSet(fastq.file) writeXStringSet(fastq, fasq.file) EVERYTHING IS LEFT OVER HERE - GRAVEYARD ```{bash} #Query sequence 1: curl https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-zq-18/SRR019/19782/SRR19782039/SRR19782039.lite.1 \ -k \ > /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon/data/VC-Exposure/exposure ``` Query Sequence 2: Exposure to a synthetic hormone 17 a-Ethinylestradiol (EE2) ```{bash} # I know I need to put the sequences in a folder, but there has to be a format mistake I am making to get this in here. #Query sequence 1: curl https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos3/sra-pub-zq-24/SRR016/16771/SRR16771870/SRR16771870.lite.1 \ -k \ > /home/shared/8TB_HDD_02/cnmntgna/Github/chris-musselcon/data/EE2/hormone ``` Query Sequence 3: Diarrhetic Shellfish Poisoning (DSP) toxins associated with Harmful Algal Blooms (HABs) ```{bash} #Query sequence 1: curl https://trace.ncbi.nlm.nih.gov/Traces/?run=SRR19782039 \ -k \ > home/shared/8TB_HDD_02/cnmntgna/Github/chris-musselcon/data/DSP-HAB ``` Query Sequence 4: Hypoxia ```{bash} #Query sequence 1: curl https://trace.ncbi.nlm.nih.gov/Traces/?run=SRR19782039 \ -k \ > home/shared/8TB_HDD_02/cnmntgna/Github/chris-musselcon/data/Hypoxia ``` ```{bash} #keeping this chunk here to remind me where my BLAST is located, DO NOT RUN # cd /home/shared/8TB_HDD_02/cnmntgna/Applications/bioinfo/ # curl -O https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz # tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz ``` ```{bash} #Project path ls /home/shared/8TB_HDD_02/cnmntgna/GitHub/chris-musselcon ``` ```{bash} ``` ```{bash} #keeping to redo database if necessary # cd /home/shared/8TB_HDD_02/cnmntgna/data # curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz # mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_01.fasta.gz # gunzip -k uniprot_sprot_r2023_01.fasta.gz # ls /home/shared/8TB_HDD_02/cnmntgna/data ``` ```{bash} # keeping for database purposes #/home/shared/8TB_HDD_02/cnmntgna/Applications/bioinfo/ncbi-blast-2.13.0+/bin/makeblastdb \ #-in /home/shared/8TB_HDD_02/cnmntgna/data/uniprot_sprot_r2023_01.fasta \ #-dbtype prot \ #-out /home/shared/8TB_HDD_02/cnmntgna/output/blastdb/uniprot_sprot_r2023_01 ``` Query Sequence 1: Exposure to Valsartan & Carbamazepine I HAVE NOT ALTERED ANY OF THE CODE BELOW TO REFLECT MY PROJECT SEQUENCES ```{bash} head ../home/shared/8TB_HDD_02/cnmntgna/data/Ab_4denovo_CLC6_a.fa echo "How many sequences are there?" grep -c ">" ../data/Ab_4denovo_CLC6_a.fa ``` ```{bash} /home/shared/8TB_HDD_02/cnmntgna/Applications/bioinfo/ncbi-blast-2.13.0+/bin/blastx \ -query ../data/Ab_4denovo_CLC6_a.fa \ -db ../blastdb/uniprot_sprot_r2023_01 \ -out ../output/Ab_4-uniprot_blastx.tab \ -evalue 1E-20 \ -num_threads 20 \ -max_target_seqs 1 \ -outfmt 6 ``` ```{bash} head -2 ../home/shared/8TB_HDD_02/cnmntgna/output/Ab_4-uniprot_blastx.tab wc -l /home/shared/8TB_HDD_02/cnmntgna/output/Ab_4-uniprot_blastx.tab ``` ```{bash} curl -O "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/search?query=reviewed:true+AND+organism_id:9606" ``` ```{bash} curl -O -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_f%2Cgo%2Cgo_p%2Cgo_c%2Cgo_id%2Ccc_interaction%2Cec%2Cxref_reactome%2Cxref_unipathway%2Cxref_interpro&format=tsv&query=%28%2A%29%20AND%20%28reviewed%3Atrue%29" ``` ```{bash} head -2 /home/shared/8TB_HDD_02/cnmntgna/output/Ab_4-uniprot_blastx.tab wc -l ../output/Ab_4-uniprot_blastx.tab ``` ```{bash} tr '|' '\t' < /home/shared/8TB_HDD_02/cnmntgna/output/Ab_4-uniprot_blastx.tab | head -2 ``` ```{bash} tr '|' '\t' < /home/shared/8TB_HDD_02/cnmntgna/output/Ab_4-uniprot_blastx.tab \ > ../output/Ab_4-uniprot_blastx_sep.tab ``` ```{bash} head -2 /home/shared/8TB_HDD_02/cnmntgna/data/uniprot_table_r2023_01.tab wc -l ../data/uniprot_table_r2023_01.tab ``` ```{r} library(tidyverse) install.packages("kableExtra") library("kableExtra") ``` ```{r} bltabl <- read.csv("../output/Ab_4-uniprot_blastx_sep.tab", sep = '\t', header = FALSE) spgo <- read.csv("../data/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE) str(spgo) ``` ```{r} kbl( head( left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) ) ) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) ``` ```{r} left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs) %>% mutate(V1 = str_replace_all(V1, pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab")) %>% write_delim("../output/blast_annot_go.tab", delim = '\t') ``` ```{r} annot_tab <- read.csv("../output/blast_annot_go.tab", sep = '\t', header = TRUE) ``` Start here for the 'try it myself' code ```{r} library(DESeq2) library(Rsubread) # Set the working directory to where the FASTQ files are located setwd("") # Load the FASTQ files files <- list.files(pattern=".fastq.gz$") names(files) <- gsub(".fastq.gz$", "", files) samples <- data.frame(sampleName = names(files), fileName = files) # Align the reads to a reference genome index <- "path/to/reference/genome" align <- align(index, files, output.format="BAM", nthreads=8) # Perform differential gene expression analysis countTable <- featureCounts(files, annot.ext="path/to/gene/annotation", isGTFAnnotationFile=TRUE, nthreads=8) dds <- DESeqDataSetFromMatrix(countData=countTable[, 7:ncol(countTable)], colData=samples, design=~sampleName) dds <- DESeq(dds) res <- results(dds) ``` ======= Section 1 - load library Section 2 - pull data files Section 3 - replicate 4/4 workflow for qc this is where the QC stuff starts ```{r} # QC time # Code from the internet, will try fastqc first and then go to biostrings if stuck # library(Biostrings) #fastq <- readDNAStringSet("~ncbi/SRR7725722_1.fastq") #read_lengths <- nchar(fastq) #summary(read_lengths) #hist(read_lengths) #filtered_fastq <- FastqSampler(fastq, sample.keep = "highQuality") #writeXStringSet(filtered_fastq, "path/to/filtered_file.fastq") # more qc options in this package #qualityScoreDistribution #GCcontent #DkMer #trimLRPatterns() #trimLowQuality #subsetByWidth #alignQualityStats ``` ```{bash} pwd cd ../output pwd ls ``` ```{r} if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("ShortRead") ``` ```{bash} cd ../output/ncbi /home/shared/FastQC/fastqc *fastq -o . ```