## The aim here is to sketch out and then implement a strategy for
## making all the org packages quickly by calling the makeOrgPackage()
## on code processed from the FTP file downloaded by the NCBI_ftp.R
## source.

## GOALS:
## 1) I want this to be performant.  It cannot take 30 minutes to
## download the whole FTP file and split that every time (and most of
## the time is in the splitting).  I need to do the initial processing
## step once and then run the recipe on some generated stuff that
## comes out of that. - DONE

## 2) I want to pre-filter based on a. whether the tax ID is legit,
## b. whether there are records for that id and c. whether there are
## GO annotations for that organism either at NCBI OR at blast2GO.

## 3) Some examples (see below) don't work; if it's because there is
## no data - FINE.  But the code needs to be more robust AND I need to
## not have these critters in the pre-filtered set.

## 4) It might make sense to use the makeOrgPackage code as a base
## for making the classic org packages FIRST (it's basically a good
## opportunity to refactor).

## So let's explore an example and see what is actually the slow part...
## slow step is the part where we discard data.
## Interactive debugging setup followed by a series of example builds.
## The package is attached first so that every function referenced below
## can be resolved; all debug() targets are namespace-qualified so the
## calls do not depend on what happens to be on the search path.
library(AnnotationForge)

debug(AnnotationForge:::makeOrgDbFromNCBI)
debug(AnnotationForge:::.makeBaseDBFromDLs)
debug(AnnotationForge:::.saveFiles)
debug(AnnotationForge:::.downloadData)
debug(AnnotationForge:::.getFiles)
debug(AnnotationForge:::.writeToNCBIDB)
debug(AnnotationForge:::.indexTaxIds)

## test for older stuff
makeOrgPackageFromNCBI(version = "0.1",
                       author = "Some One ",
                       maintainer = "Some One ",
                       outputDir = ".",
                       tax_id = "9606",
                       genus = "Homo",
                       species = "sapiens",
                       NCBIFilesDir = ".",
                       useDeprecatedStyle = TRUE)
## WORKS

## Another test for building the new data.frames out
debug(AnnotationForge:::prepareDataFromNCBI)
debug(AnnotationForge:::.getBlast2GO)

## So this is an example where there are not any refseq or genbank IDs
## (tax ID is too general)
makeOrgPackageFromNCBI(version = "0.11",
                       maintainer = "g ",
                       author = "g ",
                       outputDir = ".",
                       tax_id = "4530",
                       genus = "OryZa",
                       species = "sativa",
                       NCBIFilesDir = ".")

## Whereas this one works just fine
makeOrgPackageFromNCBI(version = "0.11",
                       maintainer = "g ",
                       author = "g ",
                       outputDir = ".",
                       tax_id = "39947",
                       genus = "OryZa",
                       species = "sativa.japonica",
                       NCBIFilesDir = ".",
                       databaseOnly = TRUE)

## The following all works:
makeOrgPackageFromNCBI(version = "0.1",
                       author = "Some One ",
                       maintainer = "Some One ",
                       outputDir = ".",
                       tax_id = "192222",
                       ## genus = "Campylobacter",
                       ## species = "jejuni",
                       NCBIFilesDir = ".",
                       databaseOnly = TRUE)
## This example demonstrates the limitations of using a lookup table
## (IOW unsupported characters can end up in the package name)

## Let's see if this will guess the genus and species here
makeOrgPackageFromNCBI(version = "0.1",
                       maintainer = "Pengfei Liu ",
                       author = "Pengfei Liu ",
                       outputDir = ".",
                       tax_id = "1041930",
                       ## genus = "Methanocella",
                       ## species = "conradii",
                       NCBIFilesDir = ".")
## And this simply fails to find a match for this species...
## There is no helping this situation since I have not thrown out any
## of the available scientific names.
## Users will simply have to provide a name in this case.

## Barley works...
makeOrgPackageFromNCBI(
    tax_id = "112509",
    genus = "Hordeum",
    species = "vulgare",
    version = "0.1",
    author = "Some One ",
    maintainer = "Some One ",
    outputDir = ".",
    NCBIFilesDir = "."
)

## Axolotl works...
makeOrgPackageFromNCBI(
    tax_id = "8296",
    genus = "Ambystoma",
    species = "mexicanum",
    version = "0.1",
    author = "Some One ",
    maintainer = "Some One ",
    outputDir = ".",
    NCBIFilesDir = "."
)

###############################################################################
## Next up: code for telling which things are viable:
## Code should 1) see if NCBI.sqlite 'cache db' exists (& if not -
## make one) and then 2) use the data in there to pre-compute which
## things are viable for making into DBs.

## (Also I need to work out how to know which things I can get data
## from blast2GO for) - pre-download this too?

###############################################################################
## AND refactor code to support the "NOSCHEMA.DB" : done
## Compare DB generated to the one made by makeOrgPackage() -
## NOSCHEMA.DB is better design.

## The older ORGANISM.db schema was unnecessarily complicated. - I
## should really upgrade all these things to the new stuff right
## now. - and ALSO: the new schema allows us to make compatible with
## NOSCHEMACHIP.db packages (simpler and more extensible).

## ALSO: the new schema is less complex and better organized since it
## is generated by a heuristic (instead of being bespoke).

## What I need to do here is to support the NOSCHEMA.DB schema and
## still support making the ORGANISM.DB packages via an argument to
## makeOrgPackageFromNCBI.  Old school packages should be born in a
## 'deprecated' state.
## (warn people to not use them and the argument
## should also be documented in a way that discourages its use)

##########################################################################
## ALSO: While I am in here I need to find a better way to deal with this:
## Basically I need an exception in place for the select code and also
## a more general solution for duplicated stuff
## goids <- keys(mouse4302.db, "GO")
## xx = select(mouse4302.db, head(goids), "GOALL", "GO")
## which takes forever and returns many duplicates

## The above is a select bug that I need to fix in AnnotationDbi...

################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
## THIS is a test to see if we can really build 1100 org Dbs objects!

## Separately, I have made a list of taxIDs that I should be able to
## quickly process into a bunch of DBs.  I aim to test that here...

## First let's just get all the viable taxIds:
## These are the IDs (prescreened) for us to process.
## mustWork = TRUE makes a missing resource fail right here with a message
## naming the file, instead of load() failing on an empty path.
## NOTE(review): load() is expected to create `results`, presumably a named
## logical vector keyed by tax ID - confirm against the .rda contents.
load(system.file("extdata", "viableIDs.rda", package = "AnnotationForge",
                 mustWork = TRUE))
ids <- names(results)[results]

## old school table of tax Ids
## NOTE(review): load() is expected to create `specData` - confirm.
load(system.file("extdata", "specData.rda", package = "GenomeInfoDb",
                 mustWork = TRUE))
## keep only the rows whose third column is not NA
sd <- specData[!is.na(specData[[3]]), ]

## Some taxonomy IDs cannot be looked up at all - so discard
ids <- ids[ids %in% sd$tax_id]

## AND remove this one bad one that we discovered (an overly general
## barley ID)
ids <- ids[!(ids %in% "4513")]

## Now ids should be OK (verified that there are species etc.
## before)

## need to find offenders
## Announce the ID being processed, then hand it to AnnotationForge's
## internal organism lookup and return whatever that lookup returns.
lookup <- function(id) {
    note <- paste0("looking up value for: ", id)
    message(note)
    AnnotationForge:::lookup_organism_by_tax_id(id)
}
res <- lapply(ids, lookup)

## Then run all (1146) of these to make sure they all work.
library(AnnotationForge)

## constant variables
version <- "3.0.0"
author <- "Marc Carlson"
maintainer <- "Bioconductor Package Maintainer "
NCBIFilesDir <- "."

## mapply variables: one element per looked-up organism
taxonomyId <- as.character(unlist(lapply(res, function(x) x$tax_id)))
genus <- unlist(lapply(res, function(x) x$genus))
species <- unlist(lapply(res, function(x) x$species))

## Some name cleanup: spaces become "_" first, then "/" becomes "|"
genus <- gsub("/", "|", gsub(" ", "_", genus))
species <- gsub("/", "|", gsub(" ", "_", species))

## Then go: one build per (tax_id, genus, species) triple, with the
## constant settings shared across every call via MoreArgs
Map(
    makeOrgPackageFromNCBI,
    tax_id = taxonomyId,
    genus = genus,
    species = species,
    MoreArgs = list(
        version = version,
        author = author,
        maintainer = maintainer,
        NCBIFilesDir = NCBIFilesDir,
        databaseOnly = TRUE
    )
)

## Map(makeOrgPackageFromNCBI,
##     tax_id=taxonomyId[1:2],
##     genus=genus[1:2],
##     species=species[1:2],
##     MoreArgs=list(
##       version=version,
##       author=author,
##       maintainer=maintainer,
##       NCBIFilesDir=NCBIFilesDir,
##       databaseOnly=TRUE
##       ))