## Read a two-column mapping file into a named list of ID vectors.
##
## file  : path handed to read.delim().
## sep   : field separator between the two columns (default: tab).
## IDsep : pattern handed to strsplit() to break the second column into IDs.
##
## Returns a list with one character vector per input row. Element names come
## from the first column; spaces are removed from both the names and the IDs.
readMappings <- function(file, sep = "\t", IDsep = ",") {
  tbl <- read.delim(file = file, header = FALSE, quote = "", sep = sep,
                    colClasses = "character")

  ## key the raw ID strings (column 2) by the space-free labels (column 1)
  idStrings <- tbl[, 2]
  names(idStrings) <- gsub(" ", "", tbl[, 1])

  ## break one row's ID string into its individual, space-free IDs
  splitIds <- function(entry) {
    pieces <- strsplit(entry, split = IDsep)[[1]]
    gsub(" ", "", pieces)
  }

  lapply(idStrings, splitIds)
}

## ---- load the full gene -> GO mapping ----
gene2GO <- readMappings(file = "hs-ensembl2go.map")

## ---- draw a reproducible subset of 100 genes ----
## the seed is derived from the number of mapped genes
set.seed(length(gene2GO))
index <- sample(length(gene2GO), 100)
str(head(gene2GO[index]))

## The gene -> GO mapping is now available, so a topGOdata object can be
## built. That additionally needs a (here: random) set of interesting genes.

## gene universe: the IDs of the sampled genes
geneNames <- names(gene2GO)[index]

## pick 20 of them at random as the "interesting" genes
myInterestedGenes <- sample(geneNames, 20)

str(geneNames)
str(myInterestedGenes)

## indicator factor over the universe: 1 = interesting, 0 = not interesting
geneList <- factor(as.integer(geneNames %in% myInterestedGenes))
names(geneList) <- geneNames
str(geneList)

library(topGO)

## ---- build the topGOdata object ----
## Annotation functions listed for this step (see help("annFUN")):
##   1. annFUN.db       -- used for bioconductor affy chips
##   2. annFUN.gene2GO  -- used when you have mappings from each gene to GOs
##   3. annFUN.GO2genes -- used when you have mappings from each GO to genes
GOdata <- new(
  "topGOdata",
  ontology = "BP",
  allGenes = geneList,
  annot = annFUN.gene2GO,     ## annotation function for gene -> GO mappings
  gene2GO = gene2GO[index]    ## the sampled gene -> GO mapping
)

## ---- export the sampled mapping as the vignette example file ----
## one row per gene: gene ID, then its GO IDs joined by ", "
a <- lapply(gene2GO[index], function(x) paste(x, collapse = ", "))
mat <- cbind(names(a), unlist(a, use.names = FALSE))
write.table(
  mat,
  file = "geneid2go.map",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)

## path of the example file as shipped with the installed topGO package
fn <- system.file("examples/geneid2go.map", package = "topGO")