## This script comprises extended tests. ##***************************************************************** ## Gviz stuff notrun_test_genetrack_df <- function(){ do.plot <- FALSE if(do.plot){ ##library(Gviz) options(ucscChromosomeNames=FALSE) data(geneModels) geneModels$chromosome <- 7 chr <- 7 start <- min(geneModels$start) end <- max(geneModels$end) myGeneModels <- getGeneRegionTrackForGviz(edb, chromosome=chr, start=start, end=end) ## chromosome has to be the same.... gtrack <- GenomeAxisTrack() gvizTrack <- GeneRegionTrack(geneModels, name="Gviz") ensdbTrack <- GeneRegionTrack(myGeneModels, name="ensdb") plotTracks(list(gtrack, gvizTrack, ensdbTrack)) plotTracks(list(gtrack, gvizTrack, ensdbTrack), from=26700000, to=26780000) ## Looks very nice... } ## Put the stuff below into the vignette: ## Next we get all lincRNAs on chromosome Y Lncs <- getGeneRegionTrackForGviz(edb, filter=list(SeqNameFilter("Y"), GeneBiotypeFilter("lincRNA"))) Prots <- getGeneRegionTrackForGviz(edb, filter=list(SeqNameFilter("Y"), GeneBiotypeFilter("protein_coding"))) if(do.plot){ plotTracks(list(gtrack, GeneRegionTrack(Lncs, name="lincRNAs"), GeneRegionTrack(Prots, name="proteins"))) plotTracks(list(gtrack, GeneRegionTrack(Lncs, name="lincRNAs"), GeneRegionTrack(Prots, name="proteins")), from=5000000, to=7000000, transcriptAnnotation="symbol") } ## is that the same than: TestL <- getGeneRegionTrackForGviz(edb, filter=list(GeneBiotypeFilter("lincRNA")), chromosome="Y", start=5000000, end=7000000) TestP <- getGeneRegionTrackForGviz(edb, filter=list(GeneBiotypeFilter("protein_coding")), chromosome="Y", start=5000000, end=7000000) if(do.plot){ plotTracks(list(gtrack, GeneRegionTrack(Lncs, name="lincRNAs"), GeneRegionTrack(Prots, name="proteins"), GeneRegionTrack(TestL, name="compareL"), GeneRegionTrack(TestP, name="compareP")), from=5000000, to=7000000, transcriptAnnotation="symbol") } expect_true(all(TestL$exon %in% Lncs$exon)) expect_true(all(TestP$exon %in% Prots$exon)) ## Crazy amazing stuff ## system.time( ## All <- getGeneRegionTrackForGviz(edb) ## ) } notrun_test_getSeqlengthsFromMysqlFolder <- function() { ## Test this for some more seqlengths. library(EnsDb.Rnorvegicus.v79) db <- EnsDb.Rnorvegicus.v79 seq_info <- seqinfo(db) seq_lengths <- ensembldb:::.getSeqlengthsFromMysqlFolder( organism = "Rattus norvegicus", ensembl = 79, seqnames = seqlevels(seq_info)) sl <- seqlengths(seq_info) sl_2 <- seq_lengths$length names(sl_2) <- rownames(seq_lengths) checkEquals(sl, sl_2) ## Mus musculus } notrun_test_ensDbFromGtf_Gff_AH <- function() { gtf <- paste0("/Users/jo/Projects/EnsDbs/80/caenorhabditis_elegans/", "Caenorhabditis_elegans.WBcel235.80.gtf.gz") outf <- tempfile() db <- ensDbFromGtf(gtf = gtf, outfile = outf) ## use Gff gff <- paste0("/Users/jo/Projects/EnsDbs/84/canis_familiaris/gff3/", "Canis_familiaris.CanFam3.1.84.gff3.gz") outf <- tempfile() db <- ensDbFromGff(gff, outfile = outf) ## Checking one from ensemblgenomes: gtf <- paste0("/Users/jo/Projects/EnsDbs/ensemblgenomes/30/", "solanum_lycopersicum/", "Solanum_lycopersicum.GCA_000188115.2.30.chr.gtf.gz" ) outf <- tempfile() db <- ensDbFromGtf(gtf = gtf, outfile = outf) gtf <- paste0("/Users/jo/Projects/EnsDbs/ensemblgenomes/30/", "solanum_lycopersicum/", "Solanum_lycopersicum.GCA_000188115.2.30.gtf.gz" ) outf <- tempfile() db <- ensDbFromGtf(gtf = gtf, outfile = outf) ## AH library(AnnotationHub) ah <- AnnotationHub() query(ah, c("release-83", "gtf")) ah_1 <- ah["AH50418"] db <- ensDbFromAH(ah_1, outfile = outf) ah_2 <- ah["AH50352"] db <- ensDbFromAH(ah_2, outfile = outf) } notrun_test_builds <- function(){ input <- "/Users/jo/Projects/EnsDbs/83/Homo_sapiens.GRCh38.83.gtf.gz" fromGtf <- ensDbFromGtf(input, outfile=tempfile()) ## provide wrong ensembl version fromGtf <- ensDbFromGtf(input, outfile=tempfile(), version="75") ## provide wrong genome version fromGtf <- ensDbFromGtf(input, outfile=tempfile(), genomeVersion="75") EnsDb(fromGtf) ## provide wrong organism fromGtf <- ensDbFromGtf(input, outfile=tempfile(), organism="blalba") EnsDb(fromGtf) ## GFF input <- "/Users/jo/Projects/EnsDbs/83/Homo_sapiens.GRCh38.83.chr.gff3.gz" fromGff <- ensDbFromGff(input, outfile=tempfile()) EnsDb(fromGff) fromGff <- ensDbFromGff(input, outfile=tempfile(), version="75") EnsDb(fromGff) fromGff <- ensDbFromGff(input, outfile=tempfile(), genomeVersion="bla") EnsDb(fromGff) fromGff <- ensDbFromGff(input, outfile=tempfile(), organism="blabla") EnsDb(fromGff) ## AH library(AnnotationHub) ah <- AnnotationHub() fromAh <- ensDbFromAH(ah["AH47963"], outfile=tempfile()) EnsDb(fromAH) fromAh <- ensDbFromAH(ah["AH47963"], outfile=tempfile(), version="75") EnsDb(fromAH) fromAh <- ensDbFromAH(ah["AH47963"], outfile=tempfile(), genomeVersion="bla") EnsDb(fromAH) fromAh <- ensDbFromAH(ah["AH47963"], outfile=tempfile(), organism="blabla") EnsDb(fromAH) } notrun_test_ensdbFromGFF <- function(){ library(ensembldb) ##library(rtracklayer) ## VERSION 83 gtf <- "/Users/jo/Projects/EnsDbs/83/Homo_sapiens.GRCh38.83.gtf.gz" fromGtf <- ensDbFromGtf(gtf, outfile=tempfile()) egtf <- EnsDb(fromGtf) gff <- "/Users/jo/Projects/EnsDbs/83/Homo_sapiens.GRCh38.83.gff3.gz" fromGff <- ensDbFromGff(gff, outfile=tempfile()) egff <- EnsDb(fromGff) ## Compare EnsDbs ensembldb:::compareEnsDbs(egtf, egff) ## OK, only Entrezgene ID "problems" ## Compare with the one built with the Perl API library(EnsDb.Hsapiens.v83) db <- EnsDb.Hsapiens.v83 ensembldb:::compareEnsDbs(egtf, edb) ensembldb:::compareEnsDbs(egff, edb) ## OK, I get different genes... genes1 <- genes(egtf) genes2 <- genes(edb) only2 <- genes2[!(genes2$gene_id %in% genes1$gene_id)] ## That below was before the fix to include feature type start_codon and stop_codon ## to the CDS type. ## Identify which are the different transcripts: txGtf <- transcripts(egtf) txGff <- transcripts(egff) commonIds <- intersect(names(txGtf), names(txGff)) haveCds <- commonIds[!is.na(txGtf[commonIds]$tx_cds_seq_start) & !is.na(txGff[commonIds]$tx_cds_seq_start)] diffs <- haveCds[txGtf[haveCds]$tx_cds_seq_start != txGff[haveCds]$tx_cds_seq_start] head(diffs) ## What could be reasons? ## 1) alternative CDS? ## Checking the GTF: ## tx ENST00000623834: start_codon: 195409 195411. ## first CDS: 195259 195411. ## last CDS: 185220 185350. ## stop_codon: 185217 185219. ## So, why the heck is the stop codon OUTSIDE the CDS??? ## library(rtracklayer) ## theGtf <- import(gtf, format="gtf") ## ## Apparently, the GTF contains the additional elements start_codon/stop_codon. ## theGff <- import(gff, format="gff3") ## transcripts(egtf, filter=TxIdFilter(diffs[1])) ## transcripts(egff, filter=TxIdFilter(diffs[1])) ## VERSION 81 ## Try to get the same via AnnotationHub gff <- "/Users/jo/Projects/EnsDbs/81/homo_sapiens/Homo_sapiens.GRCh38.81.gff3.gz" fromGff <- ensDbFromGff(gff, outfile=tempfile()) egff <- EnsDb(fromGff) gtf <- "/Users/jo/Projects/EnsDbs/81/homo_sapiens/Homo_sapiens.GRCh38.81.gtf.gz" fromGtf <- ensDbFromGtf(gtf, outfile=tempfile()) egtf <- EnsDb(fromGtf) ## Compare those two: ensembldb:::compareEnsDbs(egff, egtf) ## Why are there some differences in the transcripts??? trans1 <- transcripts(egff) trans2 <- transcripts(egtf) onlyInGtf <- trans2[!(trans2$tx_id %in% trans1$tx_id)] ##gtfGRanges <- ah["AH47963"] library(AnnotationHub) ah <- AnnotationHub() fromAh <- ensDbFromAH(ah["AH47963"], outfile=tempfile()) ## That's human... eah <- EnsDb(fromAh) ## Compare it to gtf: ensembldb:::compareEnsDbs(eah, egtf) ## OK. Same cds starts and cds ends. ## Compare it to gff: ensembldb:::compareEnsDbs(eah, egff) ## hm. ## Compare to EnsDb library(EnsDb.Hsapiens.v81) edb <- EnsDb.Hsapiens.v81 ensembldb:::compareEnsDbs(edb, egtf) ## Problem with CDS ensembldb:::compareEnsDbs(edb, egff) ## That's fine. ## Summary: ## GTF and AH are the same. ## GFF and Perl API are the same. ## OLD STUFF BELOW. ##fromAh <- EnsDbFromAH(ah["AH47963"], outfile=tempfile(), organism="Homo sapiens", version=81) ## Try with a fancy species: gff <- "/Users/jo/Projects/EnsDbs/83/gadus_morhua/Gadus_morhua.gadMor1.83.gff3.gz" fromGtf <- ensDbFromGff(gff, outfile=tempfile()) gff <- "/Users/jo/Projects/EnsDbs/83/rattus_norvegicus/Rattus_norvegicus.Rnor_6.0.83.gff3.gz" fromGff <- ensDbFromGff(gff, outfile=tempfile()) ## That works. ## Try with a file from AnnotationHub: Gorilla gorilla. library(AnnotationHub) ah <- AnnotationHub() ah <- ah["AH47962"] res <- ensDbFromAH(ah, outfile=tempfile()) edb <- EnsDb(res) genes(edb) ## ensRel <- query(ah, c("GTF", "ensembl")) ## gtf <- "/Users/jo/Projects/EnsDbs/83/Homo_sapiens.GRCh38.83.gtf.gz" ## ## GTF ## dir.create("/tmp/fromGtf") ## fromGtf <- ensDbFromGtf(gtf, path="/tmp/fromGtf", verbose=TRUE) ## ## GFF ## dir.create("/tmp/fromGff") ## fromGff <- ensembldb:::ensDbFromGff(gff, path="/tmp/fromGff", verbose=TRUE) ## ## ZBTB16: ## ## exon: ENSE00003606532 is 3rd exon of tx: ENST00000335953 ## ## exon: ENSE00003606532 is 3rd exon of tx: ENST00000392996 ## ## the Ensembl GFF has 2 entries for this exon. } ############################################################ ## Can not perform these tests right away, as they require a ## working MySQL connection. library(EnsDb.Hsapiens.v75) edb <- EnsDb.Hsapiens.v75 dontrun_test_useMySQL <- function() { edb_mysql <- useMySQL(edb, user = "anonuser", host = "localhost", pass = "") } dontrun_test_connect_EnsDb <- function() { library(RMySQL) con <- dbConnect(MySQL(), user = "anonuser", host = "localhost", pass = "") ensembldb:::listEnsDbs(dbcon = con) ## just with user. ensembldb:::listEnsDbs(user = "anonuser", host = "localhost", pass = "", port = 3306) ## Connecting directly to a EnsDb MySQL database. con <- dbConnect(MySQL(), user = "anonuser", host = "localhost", pass = "", dbname = "ensdb_hsapiens_v75") edb_mysql <- EnsDb(con) } notrun_compareEnsDbs <- function() { res <- ensembldb:::compareEnsDbs(edb, edb) } ############################################################ ## Massive test validating the cds: ## compare the length of the CDS with the length of the encoded protein. ## Get the CDS sequence, translate that and compare to protein sequence. notrun_massive_cds_test <- function() { ## Get all CDS: tx_cds <- cdsBy(edb, by = "tx", filter = SeqNameFilter(c(1:22, "X", "Y"))) prots <- proteins(edb, filter = TxIdFilter(names(tx_cds)), return.type = "AAStringSet") checkTrue(all(names(tx_cds) %in% mcols(prots)$tx_id)) tx_cds <- tx_cds[mcols(prots)$tx_id] ## Check that the length of the protein sequence is length of CDS/3 diff_width <- sum(width(tx_cds)) != width(prots) * 3 ## Why??? I've got some many differences here??? sum(diff_width) ## Check some of them manually in Ensembl ## 1st: - strand. tx_1 <- tx_cds[diff_width][1] ## Protein: 245aa prots[diff_width][1] ## OK. ## Tx 2206bp: exns <- exonsBy(edb, filter = TxIdFilter(names(tx_1))) sum(width(exns)) ## OK. ## Now to the CDS: cds_ex1 <- "ATGGCGTCCCCGTCTCGGAGACTGCAGACTAAACCAGTCATTACTTGTTTCAAGAGCGTTCTGCTAATCTACACTTTTATTTTCTGG" cds_ex2 <- "ATCACTGGCGTTATCCTTCTTGCAGTTGGCATTTGGGGCAAGGTGAGCCTGGAGAATTACTTTTCTCTTTTAAATGAGAAGGCCACCAATGTCCCCTTCGTGCTCATTGCTACTGGTACCGTCATTATTCTTTTGGGCACCTTTGGTTGTTTTGCTACCTGCCGAGCTTCTGCATGGATGCTAAAACTG" cds_ex3 <- "TATGCAATGTTTCTGACTCTCGTTTTTTTGGTCGAACTGGTCGCTGCCATCGTAGGATTTGTTTTCAGACATGAG" cds_ex4 <- "ATTAAGAACAGCTTTAAGAATAATTATGAGAAGGCTTTGAAGCAGTATAACTCTACAGGAGATTATAGAAGCCATGCAGTAGACAAGATCCAAAATACG" cds_ex5 <- "TTGCATTGTTGTGGTGTCACCGATTATAGAGATTGGACAGATACTAATTATTACTCAGAAAAAGGATTTCCTAAGAGTTGCTGTAAACTTGAAGATTGTACTCCACAGAGAGATGCAGACAAAGTAAACAATGAA" cds_ex6 <- "GGTTGTTTTATAAAGGTGATGACCATTATAGAGTCAGAAATGGGAGTCGTTGCAGGAATTTCCTTTGGAGTTGCTTGCTTCCAA" cds_ex7 <- "CTGATTGGAATCTTTCTCGCCTACTGCCTCTCTCGTGCCATAACAAATAACCAGTATGAGATAGTGTAA" cds_seq <- c(cds_ex1, cds_ex2, cds_ex3, cds_ex4, cds_ex5, cds_ex6, cds_ex7) nchar(cds_seq) width(tx_1) ## The length should be identical: checkEquals(sum(nchar(cds_seq)), sum(width(tx_1)), checkNames = FALSE) ## OK; so WHAT??? sum(width(tx_1)) / 3 ## So, start codon is encoded into a methionine. ## Stop codon is either a TAA, TGA or TAG. UAG can be encoded into Sec (U), UAG into Pyl (O) library(Biostrings) dna_s <- DNAString(paste0(cds_ex1, cds_ex2, cds_ex3, cds_ex4, cds_ex5, cds_ex6, cds_ex7)) translate(dna_s) ## Look at that! translate(DNAString("TAA")) ## -> translates into * translate(DNAString("TGA")) ## -> translates into * translate(DNAString("TAG")) ## -> translates into * translate(DNAString("ATG")) ## -> translates into M ## Assumption: ## If the mRNA ends with a TAA, the protein sequence will be 1aa shorter than ## length(CDS)/3. ## If the mRNA ends with a TAG, UAG the AA length is length(CDS)/3 ## Check one of the mRNA where it fits: tx_2 <- tx_cds[!diff_width][1] prots[!diff_width][1] ## AA is 137 long, ends with I. sum(width(tx_2)) / 3 ## OK ## Check Ensembl: tx_2_1 <- "ATGCTAAAACTG" tx_2_2 <- "TATGCAATGTTTCTGACTCTCGTTTTTTTGGTCGAACTGGTCGCTGCCATCGTAGGATTTGTTTTCAGACATGAG" tx_2_3 <- "ATTAAGAACAGCTTTAAGAATAATTATGAGAAGGCTTTGAAGCAGTATAACTCTACAGGAGATTATAGAAGCCATGCAGTAGACAAGATCCAAAATACG" tx_2_4 <- "TTGCATTGTTGTGGTGTCACCGATTATAGAGATTGGACAGATACTAATTATTACTCAGAAAAAGGATTTCCTAAGAGTTGCTGTAAACTTGAAGATTGTACTCCACAGAGAGATGCAGACAAAGTAAACAATGAA" tx_2_5 <- "GGTTGTTTTATAAAGGTGATGACCATTATAGAGTCAGAAATGGGAGTCGTTGCAGGAATTTCCTTTGGAGTTGCTTGCTTCCAA" tx_2_6 <- "GACATT" tx_2_cds <- paste0(tx_2_1, tx_2_2, tx_2_3, tx_2_4, tx_2_5, tx_2_6) nchar(tx_2_cds) sum(width(tx_2)) ## OK. translate(DNAString(tx_2_cds)) ## Next assumption: ## If we don't have a 3' UTR the AA sequence corresponds to length(CDS)/3 tx_cds <- cdsBy(edb, by = "tx", filter = SeqNameFilter(c(1:22, "X", "Y")), columns = c("tx_seq_start", "tx_seq_end", "tx_cds_seq_start", "tx_cds_seq_end")) prots <- proteins(edb, filter = TxIdFilter(names(tx_cds)), return.type = "AAStringSet") checkTrue(all(names(tx_cds) %in% mcols(prots)$tx_id)) tx_cds <- tx_cds[mcols(prots)$tx_id] ## Calculate the CDS width. tx_cds_width <- sum(width(tx_cds)) txs <- transcripts(edb, filter = TxIdFilter(names(tx_cds))) txs <- txs[names(tx_cds)] ## Subtract 3 from the width if we've got an 3'UTR. to_subtract <- rep(3, length(tx_cds_width)) to_subtract[((end(txs) == txs$tx_cds_seq_end) & as.logical(strand(txs) == "+")) | ((start(txs) == txs$tx_cds_seq_start) & as.logical(strand(txs) == "-"))] <- 0 tx_cds_width <- tx_cds_width - to_subtract ## Check that the length of the protein sequence is length of CDS/3 diff_width <- tx_cds_width != width(prots) * 3 ## Why??? I've got some many differences here??? sum(diff_width) length(diff_width) ## AAAA, still have some that don't fit!!! tx_3 <- tx_cds[diff_width][1] prots[diff_width][1] ## AA is 259aa long, ends with T., TX is: ENST00000371584 ## WTF, we've got no START CODON!!! sum(width(tx_3)) / 3 ## OMG!!! ## Now, exclude those without a 5' UTR: no_five <- ((start(txs) == txs$tx_cds_seq_start) & as.logical(strand(txs) == "+")) | ((end(txs) == txs$tx_cds_seq_end) & as.logical(strand(txs) == "-")) still_prot <- (diff_width & !no_five) } notrun_test_getGenomeFaFile <- function(){ library(EnsDb.Hsapiens.v82) edb <- EnsDb.Hsapiens.v82 ## We know that there is no Fasta file for that Ensembl release available. Fa <- getGenomeFaFile(edb) ## Got the one from Ensembl 81. genes <- genes(edb, filter=SeqNameFilter("Y")) geneSeqsFa <- getSeq(Fa, genes) ## Get the transcript sequences... txSeqsFa <- extractTranscriptSeqs(Fa, edb, filter=SeqNameFilter("Y")) ## Get the TwoBitFile. twob <- ensembldb:::getGenomeTwoBitFile(edb) ## Get thegene sequences. ## ERROR FIX BELOW WITH UPDATED VERSIONS!!! geneSeqs2b <- getSeq(twob, genes) ## Have to fix the seqnames. si <- seqinfo(twob) sn <- unlist(lapply(strsplit(seqnames(si), split=" ", fixed=TRUE), function(z){ return(z[1]) })) seqnames(si) <- sn seqinfo(twob) <- si ## Do the same with the TwoBitFile geneSeqsTB <- getSeq(twob, genes) ## Subset to all genes that are encoded on chromosomes for which ## we do have DNA sequence available. genes <- genes[seqnames(genes) %in% seqnames(seqinfo(Dna))] ## Get the gene sequences, i.e. the sequence including the sequence of ## all of the gene's exons and introns. geneSeqs <- getSeq(Dna, genes) library(AnnotationHub) ah <- AnnotationHub() quer <- query(ah, c("release-", "Homo sapiens")) ## So, I get 2bit files and toplevel stuff. Test <- ah[["AH50068"]] } notrun_test_extractTranscriptSeqs <- function(){ ## Note: we can't run that by default as we can not assume everybody has ## AnnotationHub and the required ressource installed. ## That's how we want to test the transcript seqs. genome <- getGenomeFaFile(edb) ZBTB <- extractTranscriptSeqs(genome, edb, filter=GenenameFilter("ZBTB16")) ## Load the sequences for one ZBTB16 transcript from FA. faf <- system.file("txt/ENST00000335953.fa.gz", package="ensembldb") Seqs <- readDNAStringSet(faf) tx <- "ENST00000335953" ## cDNA checkEquals(unname(as.character(ZBTB[tx])), unname(as.character(Seqs[grep(names(Seqs), pattern="cdna")]))) ## CDS cBy <- cdsBy(edb, "tx", filter=TxIdFilter(tx)) CDS <- extractTranscriptSeqs(genome, cBy) checkEquals(unname(as.character(CDS)), unname(as.character(Seqs[grep(names(Seqs), pattern="cds")]))) ## 5' UTR fBy <- fiveUTRsByTranscript(edb, filter=TxIdFilter(tx)) UTR <- extractTranscriptSeqs(genome, fBy) checkEquals(unname(as.character(UTR)), unname(as.character(Seqs[grep(names(Seqs), pattern="utr5")]))) ## 3' UTR tBy <- threeUTRsByTranscript(edb, filter=TxIdFilter(tx)) UTR <- extractTranscriptSeqs(genome, tBy) checkEquals(unname(as.character(UTR)), unname(as.character(Seqs[grep(names(Seqs), pattern="utr3")]))) ## Another gene on the reverse strand: faf <- system.file("txt/ENST00000200135.fa.gz", package="ensembldb") Seqs <- readDNAStringSet(faf) tx <- "ENST00000200135" ## cDNA cDNA <- extractTranscriptSeqs(genome, edb, filter=TxIdFilter(tx)) checkEquals(unname(as.character(cDNA)), unname(as.character(Seqs[grep(names(Seqs), pattern="cdna")]))) ## do the same, but from other strand exns <- exonsBy(edb, "tx", filter=TxIdFilter(tx)) cDNA <- extractTranscriptSeqs(genome, exns) checkEquals(unname(as.character(cDNA)), unname(as.character(Seqs[grep(names(Seqs), pattern="cdna")]))) strand(exns) <- "+" cDNA <- extractTranscriptSeqs(genome, exns) checkTrue(unname(as.character(cDNA)) != unname(as.character(Seqs[grep(names(Seqs), pattern="cdna")]))) ## CDS cBy <- cdsBy(edb, "tx", filter=TxIdFilter(tx)) CDS <- extractTranscriptSeqs(genome, cBy) checkEquals(unname(as.character(CDS)), unname(as.character(Seqs[grep(names(Seqs), pattern="cds")]))) ## 5' UTR fBy <- fiveUTRsByTranscript(edb, filter=TxIdFilter(tx)) UTR <- extractTranscriptSeqs(genome, fBy) checkEquals(unname(as.character(UTR)), unname(as.character(Seqs[grep(names(Seqs), pattern="utr5")]))) ## 3' UTR tBy <- threeUTRsByTranscript(edb, filter=TxIdFilter(tx)) UTR <- extractTranscriptSeqs(genome, tBy) checkEquals(unname(as.character(UTR)), unname(as.character(Seqs[grep(names(Seqs), pattern="utr3")]))) } notrun_test_getCdsSequence <- function(){ ## That's when we like to get the sequence from the coding region. genome <- getGenomeFaFile(edb) tx <- extractTranscriptSeqs(genome, edb, filter=SeqNameFilter("Y")) cdsSeq <- extractTranscriptSeqs(genome, cdsBy(edb, filter=SeqNameFilter("Y"))) ## that's basically to get the CDS sequence. ## UTR sequence: tutr <- extractTranscriptSeqs(genome, threeUTRsByTranscript(edb, filter=SeqNameFilter("Y"))) futr <- extractTranscriptSeqs(genome, fiveUTRsByTranscript(edb, filter=SeqNameFilter("Y"))) theTx <- "ENST00000602770" fullSeq <- as.character(tx[theTx]) ## build the one from 5', cds and 3' compSeq <- "" if(any(names(futr) == theTx)) compSeq <- paste0(compSeq, as.character(futr[theTx])) if(any(names(cdsSeq) == theTx)) compSeq <- paste0(compSeq, as.character(cdsSeq[theTx])) if(any(names(tutr) == theTx)) compSeq <- paste(compSeq, as.character(tutr[theTx])) checkEquals(unname(fullSeq), compSeq) } notrun_test_cds <- function(){ library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene cds <- cds(txdb) cby <- cdsBy(txdb, by="tx") gr <- cby[[7]][1] seqlevels(gr) <- sub(seqlevels(gr), pattern="chr", replacement="") tx <- transcripts(edb, filter=GRangesFilter(gr, condition="overlapping")) cby[[7]] ## Note: so that fits! And we've to include the stop_codon feature for GTF import! ## Make an TxDb from GTF: gtf <- "/Users/jo/Projects/EnsDbs/75/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz" library(GenomicFeatures) Test <- makeTxDbFromGFF(gtf, format="gtf", organism="Homo sapiens") scds <- cdsBy(Test, by="tx") gr <- scds[[7]][1] tx <- transcripts(edb, filter=GRangesFilter(gr, condition="overlapping")) scds[[7]] ## Compare: ## TxDb form GTF has: 865692 879533 ## EnsDb: 865692 879533 ## Next test: gr <- scds[[2]][1] tx <- transcripts(edb, filter=GRangesFilter(gr, condition="overlapping")) tx scds[[2]] ## start_codon: 367659 367661, stop_codon: 368595 368597 CDS: 367659 368594. ## TxDb from GTF includes the stop_codon! } dontrun_benchmark_ordering_genes <- function() { .withR <- function(x, ...) { ensembldb:::orderResultsInR(x) <- TRUE genes(x, ...) } .withSQL <- function(x, ...) { ensembldb:::orderResultsInR(x) <- FALSE genes(x, ...) } library(microbenchmark) microbenchmark(.withR(edb), .withSQL(edb), times = 10) ## same microbenchmark(.withR(edb, columns = c("gene_id", "tx_id")), .withSQL(edb, columns = c("gene_id", "tx_id")), times = 10) ## R slightly faster. microbenchmark(.withR(edb, columns = c("gene_id", "tx_id"), SeqNameFilter("Y")), .withSQL(edb, columns = c("gene_id", "tx_id"), SeqNameFilter("Y")), times = 10) ## same. } ## We aim to fix issue #11 by performing the ordering in R instead ## of SQL. Thus, we don't want to run this as a "regular" test ## case. dontrun_test_ordering_cdsBy <- function() { doBench <- FALSE if (doBench) library(microbenchmark) .withR <- function(x, ...) { ensembldb:::orderResultsInR(x) <- TRUE cdsBy(x, ...) } .withSQL <- function(x, ...) { ensembldb:::orderResultsInR(x) <- FALSE cdsBy(x, ...) } res_sql <- .withSQL(edb) res_r <- .withR(edb) checkEquals(res_sql, res_r) if (dobench) microbenchmark(.withSQL(edb), .withR(edb), times = 3) ## R slightly faster. res_sql <- .withSQL(edb, filter = SeqNameFilter("Y")) res_r <- .withR(edb, filter = SeqNameFilter("Y")) checkEquals(res_sql, res_r) if (dobench) microbenchmark(.withSQL(edb, filter = SeqNameFilter("Y")), .withR(edb, filter = SeqNameFilter("Y")), times = 10) ## R 6x faster. } dontrun_test_ordering_exonsBy <- function() { doBench <- FALSE if (doBench) library(microbenchmark) .withR <- function(x, ...) { ensembldb:::orderResultsInR(x) <- TRUE exonsBy(x, ...) } .withSQL <- function(x, ...) { ensembldb:::orderResultsInR(x) <- FALSE exonsBy(x, ...) } res_sql <- .withSQL(edb) res_r <- .withR(edb) checkEquals(res_sql, res_r) if (doBench) microbenchmark(.withSQL(edb), .withR(edb), times = 3) ## about the same; R slightly faster. ## with using a SeqNameFilter in addition. res_sql <- .withSQL(edb, filter = SeqNameFilter("Y")) res_r <- .withR(edb, filter = SeqNameFilter("Y")) ## query takes longer. checkEquals(res_sql, res_r) if (doBench) microbenchmark(.withSQL(edb, filter = SeqNameFilter("Y")), .withR(edb, filter = SeqNameFilter("Y")), times = 3) ## SQL twice as fast. ## Now getting stuff by gene res_sql <- .withSQL(edb, by = "gene") res_r <- .withR(edb, by = "gene") ## checkEquals(res_sql, res_r) ## Differences due to ties if (doBench) microbenchmark(.withSQL(edb, by = "gene"), .withR(edb, by = "gene"), times = 3) ## SQL faster; ??? ## Along with a SeqNameFilter res_sql <- .withSQL(edb, by = "gene", filter = SeqNameFilter("Y")) res_r <- .withR(edb, by = "gene", filter = SeqNameFilter("Y")) ## Why does the query take longer for R??? ## checkEquals(res_sql, res_r) ## Differences due to ties if (doBench) microbenchmark(.withSQL(edb, by = "gene", filter = SeqNameFilter("Y")), .withR(edb, by = "gene", filter = SeqNameFilter("Y")), times = 3) ## SQL faster. ## Along with a GeneBiotypeFilter if (doBench) microbenchmark(.withSQL(edb, by = "gene", filter = GeneBiotypeFilter("protein_coding")) , .withR(edb, by = "gene", filter = GeneBiotypeFilter("protein_coding")) , times = 3) } dontrun_test_ordering_transcriptsBy <- function() { .withR <- function(x, ...) { ensembldb:::orderResultsInR(x) <- TRUE transcriptsBy(x, ...) } .withSQL <- function(x, ...) { ensembldb:::orderResultsInR(x) <- FALSE transcriptsBy(x, ...) } res_sql <- .withSQL(edb) res_r <- .withR(edb) checkEquals(res_sql, res_r) microbenchmark(.withSQL(edb), .withR(edb), times = 3) ## same speed res_sql <- .withSQL(edb, filter = SeqNameFilter("Y")) res_r <- .withR(edb, filter = SeqNameFilter("Y")) checkEquals(res_sql, res_r) microbenchmark(.withSQL(edb, filter = SeqNameFilter("Y")), .withR(edb, filter = SeqNameFilter("Y")), times = 3) ## SQL slighly faster. } dontrun_query_tune <- function() { ## Query tuning: library(RSQLite) con <- dbconn(edb) Q <- "select distinct tx2exon.exon_id,exon.exon_seq_start,exon.exon_seq_end,gene.seq_name,tx2exon.tx_id,gene.seq_strand,tx2exon.exon_idx from gene join tx on (gene.gene_id=tx.gene_id) join tx2exon on (tx.tx_id=tx2exon.tx_id) join exon on (tx2exon.exon_id=exon.exon_id) where gene.seq_name = 'Y'" system.time(dbGetQuery(con, Q)) Q2 <- "select distinct tx2exon.exon_id,exon.exon_seq_start,exon.exon_seq_end,gene.seq_name,tx2exon.tx_id,gene.seq_strand,tx2exon.exon_idx from exon join tx2exon on (tx2exon.exon_id = exon.exon_id) join tx on (tx2exon.tx_id = tx.tx_id) join gene on (gene.gene_id=tx.gene_id) where gene.seq_name = 'Y'" system.time(dbGetQuery(con, Q2)) Q3 <- "select distinct tx2exon.exon_id,exon.exon_seq_start,exon.exon_seq_end,gene.seq_name,tx2exon.tx_id,gene.seq_strand,tx2exon.exon_idx from tx2exon join exon on (tx2exon.exon_id = exon.exon_id) join tx on (tx2exon.tx_id = tx.tx_id) join gene on (gene.gene_id=tx.gene_id) where gene.seq_name = 'Y'" system.time(dbGetQuery(con, Q3)) Q4 <- "select distinct tx2exon.exon_id,exon.exon_seq_start,exon.exon_seq_end,gene.seq_name,tx2exon.tx_id,gene.seq_strand,tx2exon.exon_idx from tx2exon join exon on (tx2exon.exon_id = exon.exon_id) join tx on (tx2exon.tx_id = tx.tx_id) join gene on (gene.gene_id=tx.gene_id) where gene.seq_name = 'Y' order by tx.tx_id" system.time(dbGetQuery(con, Q4)) Q5 <- "select distinct tx2exon.exon_id,exon.exon_seq_start,exon.exon_seq_end,gene.seq_name,tx2exon.tx_id,gene.seq_strand,tx2exon.exon_idx from tx2exon inner join exon on (tx2exon.exon_id = exon.exon_id) inner join tx on (tx2exon.tx_id = tx.tx_id) inner join gene on (gene.gene_id=tx.gene_id) where gene.seq_name = 'Y' order by tx.tx_id" system.time(dbGetQuery(con, Q5)) Q6 <- "select distinct tx2exon.exon_id,exon.exon_seq_start,exon.exon_seq_end,gene.seq_name,tx2exon.tx_id,gene.seq_strand,tx2exon.exon_idx from gene inner join tx on (gene.gene_id=tx.gene_id) inner join tx2exon on (tx.tx_id=tx2exon.tx_id) inner join exon on (tx2exon.exon_id=exon.exon_id) where gene.seq_name = 'Y' order by tx.tx_id asc" system.time(dbGetQuery(con, Q6)) } ## implement: ## .checkOrderBy: checks order.by argument removing columns that are ## not present in the database ## orderBy columns are added to the columns. ## .orderDataFrameBy: orders the dataframe by the specified columns. notrun_test_protein_domains <- function() { res <- ensembldb:::getWhat(edb, columns = c("protein_id", "tx_id", "gene_id", "gene_name"), filter = list(ProtDomIdFilter("PF00096"))) } notrun_compare_full <- function(){ ## That's on the full thing. ## Test if the result has the same ordering than the transcripts call. allTx <- transcripts(edb) txLen <- transcriptLengths(edb, with.cds_len=TRUE, with.utr5_len=TRUE, with.utr3_len=TRUE) checkEquals(names(allTx), rownames(txLen)) system.time( futr <- fiveUTRsByTranscript(edb) ) ## 23 secs. futrLen <- sum(width(futr)) ## do I need reduce??? checkEquals(unname(futrLen), txLen[names(futrLen), "utr5_len"]) ## 3' system.time( tutr <- threeUTRsByTranscript(edb) ) system.time( tutrLen <- sum(width(tutr)) ) checkEquals(unname(tutrLen), txLen[names(tutrLen), "utr3_len"]) } notrun_compare_to_genfeat <- function(){ library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene system.time( Len <- transcriptLengths(edb) ) ## Woa, 52 sec system.time( txLen <- lengthOf(edb, "tx") ) ## Faster, 31 sec checkEquals(Len$tx_len, unname(txLen[rownames(Len)])) system.time( Len2 <- transcriptLengths(txdb) ) ## :) 2.5 sec. ## Next. system.time( Len <- transcriptLengths(edb, with.cds_len = TRUE) ) ## 56 sec system.time( Len2 <- transcriptLengths(txdb, with.cds_len=TRUE) ) ## 4 sec. ## Calling the transcriptLengths of GenomicFeatures on the EnsDb. system.time( Def <- GenomicFeatures::transcriptLengths(edb) ) ## 26.5 sec system.time( WithCds <- GenomicFeatures::transcriptLengths(edb, with.cds_len=TRUE) ) ## 55 sec system.time( WithAll <- GenomicFeatures::transcriptLengths(edb, with.cds_len=TRUE, with.utr5_len=TRUE, with.utr3_len=TRUE) ) ## 99 secs ## Get my versions... system.time( MyDef <- ensembldb:::.transcriptLengths(edb) ) ## 31 sec system.time( MyWithCds <- ensembldb:::.transcriptLengths(edb, with.cds_len=TRUE) ) ## 44 sec system.time( MyWithAll <- ensembldb:::.transcriptLengths(edb, with.cds_len=TRUE, with.utr5_len=TRUE, with.utr3_len=TRUE) ) ## 63 sec ## Should be all the same!!! rownames(MyDef) <- NULL checkEquals(Def, MyDef) ## rownames(MyWithCds) <- NULL MyWithCds[is.na(MyWithCds$cds_len), "cds_len"] <- 0 checkEquals(WithCds, MyWithCds) ## rownames(MyWithAll) <- NULL MyWithAll[is.na(MyWithAll$cds_len), "cds_len"] <- 0 MyWithAll[is.na(MyWithAll$utr3_len), "utr3_len"] <- 0 MyWithAll[is.na(MyWithAll$utr5_len), "utr5_len"] <- 0 checkEquals(WithAll, MyWithAll) }