## Overall strategy process the data in NCBI.sqlite and extract ## meaningful taxIDs. Then save those for use in the recipe above. ## Step 0: Run these functions from a directory where you have already ## built an NCBI.sqlite file. You may call the example in ## makeOrgPackageFromNCBI() ## You can run on an EC2 instance or locally ## If you run locally you will need at least 59G of space as of May 2021 ## the size of the files tends to increase slightly each release if(FALSE){ library(AnnotationForge) makeOrgPackageFromNCBI(version = "0.1",author = "Some One ", maintainer = "Some One ", outputDir = ".", tax_id = "59729", genus = "Taeniopygia", species = "guttata") } ## STEP 1: ## This helper will just get the taxIDs that we already have GO data for. .getCoreTaxIds <- function(NCBIFilesDir=getwd()){ ## connect to the DB require(RSQLite) NCBIcon <- dbConnect(SQLite(), dbname = "NCBI.sqlite") res = dbGetQuery(NCBIcon, "SELECT DISTINCT tax_id FROM gene2go;")[[1]] dbDisconnect(NCBIcon) res } coreIDs = .getCoreTaxIds() ## Next get the list of other taxIds where there is at least one GO term ## Step 2: .getAltTaxIds <- function(NCBIFilesDir=getwd()){ require(RSQLite) NCBIcon <- dbConnect(SQLite(), dbname = "NCBI.sqlite") ## Then get the go related Tax Ids sql <- paste0("SELECT distinct NCBItaxon FROM altGO") goTaxIds <- dbGetQuery(NCBIcon, sql)[[1]] ## And also the gene related tax Ids sql <- paste0("SELECT distinct tax_id FROM gene_info") geneTaxIds <- dbGetQuery(NCBIcon, sql)[[1]] tbl = intersect(goTaxIds, geneTaxIds) dbDisconnect(NCBIcon) tbl } altTaxIDs = .getAltTaxIds() ## Step 3: combine all taxIds and remove ones that we already ## have as packages: .getPackageOrgDbTaxIds <- function(){ # orgDbs <- .GetOrgDbs() # if orgDbs are not downloaded yet, do .getOrgDbs(TRUE) orgDbs <-AnnotationHubData:::.getOrgDbs(TRUE) # as.integer(unlist(lapply(orgDbs, # function(x){m <- metadata(x); m[m$name=='TAXID', 2] }))) as.integer(AnnotationHubData:::.orgDbPkgMetadataFromObjs(orgDbs, "3.15")$taxonomyId) } existingOrgPkgTaxIds <- .getPackageOrgDbTaxIds() allIds <- union(coreIDs, altTaxIDs) allIds <- allIds[!(allIds %in% existingOrgPkgTaxIds)] ## Step 4: sort the taxIds by the coverage for genes .getSortedTaxIds <- function(allIds, NCBIFilesDir=getwd()){ NCBIcon <- dbConnect(SQLite(), dbname = "NCBI.sqlite") ## ## Then get the go related Tax Ids idStr <- paste(allIds,collapse="','") sql <- paste0("SELECT tax_id FROM gene_info WHERE tax_id IN('",idStr,"')") giFullTaxIds <- dbGetQuery(NCBIcon, sql)[[1]] ## now I just need to quantify the number of each type taxidTable <- table(giFullTaxIds) tbl = names(sort(taxidTable, decreasing=TRUE))[1:1000] dbDisconnect(NCBIcon) tbl } results <- .getSortedTaxIds(allIds) save(results, file='viableIDs.rda') ## NCBIcon <- dbConnect(SQLite(), dbname = "NCBI.sqlite") ## ## Then get the go related Tax Ids ## idStr <- paste(allIds,collapse="','") ## sql <- paste0("SELECT tax_id FROM gene_info WHERE tax_id IN('",idStr,"')") ## giFullTaxIds <- dbGetQuery(NCBIcon, sql)[[1]] ## ## ## now I just need to quantify the number of each type ## taxidTable <- table(giFullTaxIds) ## head(sort(taxidTable, decreasing=TRUE)) ## The length of the allTaxIDs vector is: 555345! ## Match the existing tax IDs with a third function: