# File src/library/tools/R/urltools.R # Part of the R package, https://www.R-project.org # # Copyright (C) 2015-2021 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ get_IANA_URI_scheme_db <- function() { ## See ## . baseurl <- "https://www.iana.org/assignments/uri-schemes/" db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")), stringsAsFactors = FALSE, encoding = "UTF-8") names(db) <- chartr(".", "_", names(db)) db } parse_URI_reference <- function(x) { ## See RFC_3986 . re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" if(length(x)) { y <- do.call(rbind, regmatches(x, regexec(re, x))) y <- y[, c(3, 5, 6, 8, 10), drop = FALSE] } else { y <- matrix(character(), 0L, 5L) } colnames(y) <- c("scheme", "authority", "path", "query", "fragment") y } .get_urls_from_Rd <- function(x, href = TRUE, ifdef = FALSE) { urls <- character() recurse <- function(e) { tag <- attr(e, "Rd_tag") ## Rd2HTML and Rd2latex remove whitespace and \n from URLs. if(identical(tag, "\\url")) { urls <<- c(urls, lines2str(.Rd_deparse(e, tag = FALSE))) } else if(href && identical(tag, "\\href")) { ## One could also record the \href text argument in the ## names, but then one would need to process named and ## unnamed extracted URLs separately. urls <<- c(urls, lines2str(.Rd_deparse(e[[1L]], tag = FALSE))) } else if(ifdef && length(tag) && (tag %in% c("\\if", "\\ifelse"))) { ## cf. testRdConditional() condition <- e[[1L]] if(all(RdTags(condition) == "TEXT")) { if(any(c("TRUE", "html") %in% trimws(strsplit(paste(condition, collapse = ""), ",")[[1L]]))) recurse(e[[2L]]) else if(tag == "\\ifelse") recurse(e[[3L]]) } } else if(is.list(e)) lapply(e, recurse) } lapply(x, recurse) unique(trimws(urls)) } .get_urls_from_HTML_file <- function(f) { doc <- xml2::read_html(f) if(!inherits(doc, "xml_node")) return(character()) nodes <- xml2::xml_find_all(doc, "//a") hrefs <- xml2::xml_attr(nodes, "href") unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")]) } .get_urls_from_PDF_file <- function(f) { ## Seems there is no straightforward way to extract hyperrefs from a ## PDF, hence first convert to HTML. ## Note that pdftohtml always outputs in cwd ... owd <- getwd() dir.create(d <- tempfile()) on.exit({ unlink(d, recursive = TRUE); setwd(owd) }) file.copy(normalizePath(f), d) setwd(d) g <- tempfile(tmpdir = d, fileext = ".xml") system2("pdftohtml", c("-s -q -i -c -xml", shQuote(basename(f)), shQuote(basename(g)))) ## Oh dear: seems that pdftohtml can fail without a non-zero exit ## status. if(file.exists(g)) .get_urls_from_HTML_file(g) else character() } url_db <- function(urls, parents) { ## Some people get leading LFs in URLs, so trim before checking. db <- data.frame(URL = trimws(as.character(urls)), Parent = as.character(parents), stringsAsFactors = FALSE) class(db) <- c("url_db", "data.frame") db } url_db_from_HTML_files <- function(dir, recursive = FALSE, files = NULL, verbose = FALSE) { urls <- parents <- character() if(is.null(files)) files <- list.files(dir, pattern = "[.]html$", full.names = TRUE, recursive = recursive) urls <- lapply(files, function(f) { if(verbose) message(sprintf("processing %s", .file_path_relative_to_dir(f, dir))) .get_urls_from_HTML_file(f) }) names(urls) <- files urls <- Filter(length, urls) if(length(urls)) { parents <- rep.int(.file_path_relative_to_dir(names(urls), dir), lengths(urls)) urls <- unlist(urls, use.names = FALSE) } url_db(urls, parents) } url_db_from_PDF_files <- function(dir, recursive = FALSE, files = NULL, verbose = FALSE) { urls <- parents <- character() if(is.null(files)) files <- list.files(dir, pattern = "[.]pdf$", full.names = TRUE, recursive = recursive) ## FIXME: this is simpler to do with full.names = FALSE and without ## tools:::.file_path_relative_to_dir(). urls <- lapply(files, function(f) { if(verbose) message(sprintf("processing %s", .file_path_relative_to_dir(f, dir))) .get_urls_from_PDF_file(f) }) names(urls) <- files urls <- Filter(length, urls) if(length(urls)) { parents <- rep.int(.file_path_relative_to_dir(names(urls), dir), lengths(urls)) urls <- unlist(urls, use.names = FALSE) } url_db(urls, parents) } url_db_from_package_Rd_db <- function(db) { urls <- Filter(length, lapply(db, .get_urls_from_Rd)) url_db(unlist(urls, use.names = FALSE), rep.int(file.path("man", names(urls)), lengths(urls))) } url_db_from_package_metadata <- function(meta) { urls <- character() fields <- c("URL", "BugReports") for(v in meta[fields]) { if(is.na(v)) next pattern <- "<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]*>" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) regmatches(v, m) <- "" pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) } if(!is.na(v <- meta["Description"])) { pattern <- "<(URL: *)?((https?|ftp)://[^[:space:]]+)[[:space:]]*>" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) regmatches(v, m) <- "" pattern <- "([^>\"])((https?|ftp)://[[:alnum:]/.:@+\\_~%#?=&;,-]+[[:alnum:]/])" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) } url_db(urls, rep.int("DESCRIPTION", length(urls))) } url_db_from_package_citation <- function(dir, meta, installed = FALSE) { urls <- character() path <- if(installed) "CITATION" else file.path("inst", "CITATION") cfile <- file.path(dir, path) if(file.exists(cfile)) { cinfo <- .read_citation_quietly(cfile, meta) if(!inherits(cinfo, "error")) urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE))) } url_db(urls, rep.int(path, length(urls))) } url_db_from_package_news <- function(dir, installed = FALSE) { path <- if(installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd") nfile <- file.path(dir, path) urls <- if(file.exists(nfile)) { macros <- initialRdMacros() .get_urls_from_Rd(prepare_Rd(parse_Rd(nfile, macros = macros), stages = "install")) } else character() url_db(urls, rep.int(path, length(urls))) } url_db_from_package_HTML_files <- function(dir, installed = FALSE) { path <- if(installed) "doc" else file.path("inst", "doc") files <- Sys.glob(file.path(dir, path, "*.html")) if(installed && file.exists(rfile <- file.path(dir, "README.html"))) files <- c(files, rfile) url_db_from_HTML_files(dir, files = files) } url_db_from_package_README_md <- function(dir, installed = FALSE) { urls <- path <- character() rfile <- Filter(file.exists, c(if(!installed) file.path(dir, "inst", "README.md"), file.path(dir, "README.md")))[1L] if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) { path <- .file_path_relative_to_dir(rfile, dir) tfile <- tempfile("README", fileext = ".html") on.exit(unlink(tfile)) out <- .pandoc_md_for_CRAN(rfile, tfile) if(!out$status) { urls <- .get_urls_from_HTML_file(tfile) } } url_db(urls, rep.int(path, length(urls))) } url_db_from_package_NEWS_md <- function(dir, installed = FALSE) { urls <- path <- character() nfile <- Filter(file.exists, c(if(!installed) file.path(dir, "inst", "NEWS.md"), file.path(dir, "NEWS.md")))[1L] if(!is.na(nfile) && nzchar(Sys.which("pandoc"))) { path <- .file_path_relative_to_dir(nfile, dir) tfile <- tempfile("NEWS", fileext = ".html") on.exit(unlink(tfile)) out <- .pandoc_md_for_CRAN(nfile, tfile) if(!out$status) { urls <- .get_urls_from_HTML_file(tfile) } } url_db(urls, rep.int(path, length(urls))) } url_db_from_package_sources <- function(dir, add = FALSE) { meta <- .read_description(file.path(dir, "DESCRIPTION")) db <- rbind(url_db_from_package_metadata(meta), url_db_from_package_Rd_db(Rd_db(dir = dir)), url_db_from_package_citation(dir, meta), url_db_from_package_news(dir)) if(requireNamespace("xml2", quietly = TRUE)) { db <- rbind(db, url_db_from_package_HTML_files(dir), url_db_from_package_README_md(dir), url_db_from_package_NEWS_md(dir) ) } if(add) db$Parent <- file.path(basename(dir), db$Parent) db } url_db_from_installed_packages <- function(packages, lib.loc = NULL, verbose = FALSE) { if(!length(packages)) return() one <- function(p) { if(verbose) message(sprintf("processing %s", p)) dir <- system.file(package = p, lib.loc = lib.loc) if(dir == "") return() meta <- .read_description(file.path(dir, "DESCRIPTION")) rddb <- Rd_db(p, lib.loc = dirname(dir)) db <- rbind(url_db_from_package_metadata(meta), url_db_from_package_Rd_db(rddb), url_db_from_package_citation(dir, meta, installed = TRUE), url_db_from_package_news(dir, installed = TRUE)) if(requireNamespace("xml2", quietly = TRUE)) { db <- rbind(db, url_db_from_package_HTML_files(dir, installed = TRUE), url_db_from_package_README_md(dir, installed = TRUE), url_db_from_package_NEWS_md(dir, installed = TRUE) ) } db$Parent <- file.path(p, db$Parent) db } do.call(rbind, c(lapply(packages, one), list(make.row.names = FALSE))) } get_IANA_HTTP_status_code_db <- function() { ## See ## baseurl <- "https://www.iana.org/assignments/http-status-codes/" db <- utils::read.csv(url(paste0(baseurl, "http-status-codes-1.csv")), stringsAsFactors = FALSE) ## Drop "Unassigned". db[db$Description != "Unassigned", ] } ## See ## and , ## Section 4.2.2 "Numeric Order List of Reply Codes", ## and , ## Section 5 "New FTP Replies". ## Only need those >= 400. table_of_FTP_server_return_codes <- c("421" = "Service not available, closing control connection.", "425" = "Can't open data connection.", "426" = "Connection closed; transfer aborted.", "430" = "Invalid username or password", "431" = "Need some unavailable resource to process security.", "434" = "Requested host unavailable.", "450" = "Requested file action not taken.", "451" = "Requested action aborted: local error in processing.", "452" = "Requested action not taken. Insufficient storage space in system.", "500" = "Syntax error, command unrecognized.", "501" = "Syntax error in parameters or arguments.", "502" = "Command not implemented.", "503" = "Bad sequence of commands.", "504" = "Command not implemented for that parameter.", "530" = "Not logged in.", "532" = "Need account for storing files.", "533" = "Command protection level denied for policy reasons.", "534" = "Request denied for policy reasons.", "535" = "Failed security check (hash, sequence, etc).", "536" = "Requested PROT level not supported by mechanism.", "537" = "Command protection level not supported by security mechanism.", "550" = "Requested action not taken. File unavailable", "551" = "Requested action aborted: page type unknown.", "552" = "Requested file action aborted. Exceeded storage allocation (for current directory or dataset).", "553" = "Requested action not taken. File name not allowed.", "631" = "Integrity protected reply.", "632" = "Confidentiality and integrity protected reply.", "633" = "Confidentiality protected reply." ) check_url_db <- function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL) { use_curl <- !parallel && config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_", "TRUE")) && requireNamespace("curl", quietly = TRUE) if(parallel && is.null(pool)) pool <- curl::new_pool() .gather <- function(u = character(), p = list(), s = rep.int("", length(u)), m = rep.int("", length(u)), new = rep.int("", length(u)), cran = rep.int("", length(u)), spaces = rep.int("", length(u)), R = rep.int("", length(u))) { y <- data.frame(URL = u, From = I(p), Status = s, Message = m, New = new, CRAN = cran, Spaces = spaces, R = R, row.names = NULL, stringsAsFactors = FALSE) y$From <- p class(y) <- c("check_url_db", "data.frame") y } .fetch_headers <- if(parallel) function(urls) .fetch_headers_via_curl(urls, verbose, pool) else function(urls) .fetch_headers_via_base(urls, verbose) .check_ftp <- function(u, h) { if(inherits(h, "error")) { s <- "-1" msg <- sub("[[:space:]]*$", "", conditionMessage(h)) } else { s <- as.character(attr(h, "status")) msg <- table_of_FTP_server_return_codes[s] } c(s, msg, "", "") } .check_http <- if(remote) function(u, h) c(.check_http_A(u, h), .check_http_B(u)) else function(u, h) c(rep.int("", 3L), .check_http_B(u)) .check_http_A <- function(u, h) { newLoc <- "" if(inherits(h, "error")) { s <- "-1" msg <- sub("[[:space:]]*$", "", conditionMessage(h)) if(grepl(paste(c("server certificate verification failed", "failed to get server cert", "libcurl error code (51|60)"), collapse = "|"), msg)) { h2 <- tryCatch(curlGetHeaders(u, verify = FALSE), error = identity) s2 <- as.character(attr(h2, "status")) msg <- paste0(msg, "\n\t(Status without verification: ", table_of_HTTP_status_codes[s2], ")") } } else { s <- as.character(attr(h, "status")) msg <- table_of_HTTP_status_codes[s] } ## Look for redirected URLs ## According to ## the first ## line of a response is the status-line, with "a possibly empty ## textual phrase describing the status code", so only look for ## a 301 status code in the first line. if(grepl(" 301 ", h[1L], useBytes = TRUE) && !startsWith(u, "https://doi.org/") && !startsWith(u, "http://dx.doi.org/")) { ## Get the new location from the last consecutive 301 ## obtained. h <- split(h, c(0L, cumsum(h == "\r\n")[-length(h)])) i <- vapply(h, function(e) grepl(" 301 ", e[1L], useBytes = TRUE), NA) h <- h[[which(!i)[1L] - 1L]] pos <- grep("^[Ll]ocation: ", h, useBytes = TRUE) if(length(pos)) { loc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1", h[pos[1L]]) ## Ouch. According to RFC 7231, the location is a URI ## reference, and may be relative in which case it needs ## resolving against the effect request URI. ## . ## Not quite straightforward, hence do not report such ## 301s. ## (Alternatively, could try reporting the 301 but no ## new location.) if(nzchar(parse_URI_reference(loc)[1L, "scheme"])) newLoc <- loc ## (Note also that fragments would need extra care.) } } ## if((s != "200") && use_curl) { g <- .curl_GET_status(u) if(g == "200") { s <- g msg <- "OK" } } ## A mis-configured site if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc)))) s <- "405" c(s, msg, newLoc) } .check_http_B <- function(u) { ul <- tolower(u) cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) && !grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]_]+(html|pdf|rds)$", ul)) || (grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$", ul)) || startsWith(ul, "http://cran.r-project.org") || any(startsWith(ul, mirrors))) R <- grepl("^http://(www|bugs|journal).r-project.org", ul) spaces <- grepl(" ", u) c(if(cran) u else "", if(spaces) u else "", if(R) u else "") } bad <- .gather() if(!NROW(db)) return(bad) ## Could also use utils::getCRANmirrors(local.only = TRUE). mirrors <- c(utils::read.csv(file.path(R.home("doc"), "CRAN_mirrors.csv"), as.is = TRUE, encoding = "UTF-8")$URL, "http://cran.rstudio.com/", "https://cran.rstudio.com/") mirrors <- tolower(sub("/$", "", mirrors)) if(inherits(db, "check_url_db")) { ## Allow re-checking check results. parents <- db$From urls <- db$URL } else { parents <- split(db$Parent, db$URL) urls <- names(parents) } parts <- parse_URI_reference(urls) ## Empty URLs. ind <- apply(parts == "", 1L, all) if(any(ind)) { len <- sum(ind) bad <- rbind(bad, .gather(urls[ind], parents[ind], m = rep.int("Empty URL", len))) } ## Invalid URI schemes. schemes <- parts[, 1L] ind <- is.na(match(schemes, c("", IANA_URI_scheme_db$URI_Scheme, ## Also allow 'javascript' scheme, see ## ## (but apparently never registered with IANA). "javascript"))) if(any(ind)) { len <- sum(ind) msg <- rep.int("Invalid URI scheme", len) doi <- schemes[ind] == "doi" if(any(doi)) msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)") bad <- rbind(bad, .gather(urls[ind], parents[ind], m = msg)) } ## ftp. pos <- which(schemes == "ftp") if(length(pos) && remote) { urlspos <- urls[pos] headers <- .fetch_headers(urlspos) results <- do.call(rbind, Map(.check_ftp, urlspos, headers)) status <- as.numeric(results[, 1L]) ind <- (status < 0L) | (status >= 400L) if(any(ind)) { pos <- pos[ind] s <- as.character(status[ind]) s[s == "-1"] <- "Error" m <- results[ind, 2L] m[is.na(m)] <- "" bad <- rbind(bad, .gather(urls[pos], parents[pos], s, m)) } } ## http/https. pos <- which(schemes == "http" | schemes == "https") if(length(pos)) { urlspos <- urls[pos] headers <- .fetch_headers(urlspos) results <- do.call(rbind, Map(.check_http, urlspos, headers)) status <- as.numeric(results[, 1L]) ## 405 is HTTP not allowing HEAD requests ## maybe also skip 500, 503, 504 as likely to be temporary issues ind <- is.na(match(status, c(200L, 405L, NA))) | nzchar(results[, 3L]) | nzchar(results[, 4L]) | nzchar(results[, 5L]) | nzchar(results[, 6L]) if(any(ind)) { pos <- pos[ind] s <- as.character(status[ind]) s[is.na(s)] <- "" s[s == "-1"] <- "Error" m <- results[ind, 2L] m[is.na(m)] <- "" bad <- rbind(bad, .gather(urls[pos], parents[pos], s, m, results[ind, 3L], results[ind, 4L], results[ind, 5L], results[ind, 6L])) } } bad } format.check_url_db <- function(x, ...) { if(!NROW(x)) return(character()) u <- x$URL new <- x$New ind <- nzchar(new) if(any(ind)) { u[ind] <- sprintf("%s (moved to %s)", u[ind], new[ind]) if(config_val_to_logical(Sys.getenv("_R_CHECK_URLS_SHOW_301_STATUS_", "FALSE"))) { x$Message[ind] <- "Moved Permanently" x$Status[ind] <- "301" } } paste0(sprintf("URL: %s", u), sprintf("\nFrom: %s", vapply(x$From, paste, "", collapse = "\n ")), ifelse((s <- x$Status) == "", "", sprintf("\nStatus: %s", s)), ifelse((m <- x$Message) == "", "", sprintf("\nMessage: %s", gsub("\n", "\n ", m, fixed=TRUE))), ifelse((m <- x$Spaces) == "", "", "\nURL contains spaces"), ifelse((m <- x$CRAN) == "", "", "\nCRAN URL not in canonical form"), ifelse((m <- x$R) == "", "", "\nR-project URL not in canonical form") ) } print.check_url_db <- function(x, ...) { if(NROW(x)) writeLines(paste(format(x), collapse = "\n\n")) invisible(x) } as.matrix.check_url_db <- function(x, ...) { n <- lengths(x[["From"]]) y <- do.call(cbind, c(list(URL = rep.int(x[["URL"]], n), Parent = unlist(x[["From"]])), lapply(x[-c(1L, 2L)], rep.int, n))) rownames(y) <- NULL y } .fetch_headers_via_base <- function(urls, verbose = FALSE, ids = urls) Map(function(u, verbose, i) { if(verbose) message(sprintf("processing %s", i)) tryCatch(curlGetHeaders(u), error = identity) }, urls, verbose, ids) .fetch_headers_via_curl <- function(urls, verbose = FALSE, pool = NULL) { .progress_bar <- function(length, msg = "") { bar <- new.env(parent = baseenv()) if(is.null(length)) { length <- 0L } ## ## make codetools happy done <- fmt <- NULL ## bar$length <- length bar$done <- -1L digits <- trunc(log10(length)) + 1L bar$fmt <- paste0("\r", msg, "[ %", digits, "i / %", digits, "i ]") bar$update <- function() { assign("done", inherits = TRUE, done + 1L) if (length <= 0L) { return() } if (done >= length) { cat("\r", strrep(" ", nchar(fmt)), "\r", sep = "") } else { cat(sprintf(fmt, done, length), sep = "") } } environment(bar$update) <- bar bar$update() bar } if(is.null(pool)) pool <- curl::new_pool() hs <- vector("list", length(urls)) bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ") for(i in seq_along(hs)) { u <- urls[[i]] h <- curl::new_handle(url = u) curl::handle_setopt(h, nobody = TRUE, cookiesession = 1L, followlocation = 1L, http_version = 2L, ssl_enable_alpn = 0L) timeout <- as.integer(getOption("timeout")) if(!is.na(timeout) && (timeout > 0L)) curl::handle_setopt(h, connecttimeout = timeout, timeout = timeout) if(grepl("^https?://github[.]com", u) && nzchar(a <- Sys.getenv("GITHUB_PAT", ""))) { curl::handle_setheaders(h, "Authorization" = paste("token", a)) } handle_result <- local({ i <- i function(x) { hs[[i]] <<- x bar$update() } }) handle_error <- local({ i <- i function(x) { hs[[i]] <<- structure(list(message = x), class = c("curl_error", "error", "condition")) bar$update() } }) curl::multi_add(h, done = handle_result, fail = handle_error, pool = pool) } curl::multi_run(pool = pool) out <- vector("list", length(hs)) for(i in seq_along(out)) { if(inherits(hs[[i]], "error")) { out[[i]] <- hs[[i]] } else { out[[i]] <- strsplit(rawToChar(hs[[i]]$headers), "(?<=\r\n)", perl = TRUE)[[1L]] attr(out[[i]], "status") <- hs[[i]]$status_code } } out } .curl_GET_status <- function(u, verbose = FALSE) { if(verbose) message(sprintf("processing %s", u)) ## Configure curl handle for better luck with JSTOR URLs/DOIs. ## Alternatively, special-case requests to ## https?://doi.org/10.2307 ## https?://www.jstor.org h <- curl::new_handle() curl::handle_setopt(h, cookiesession = 1, followlocation = 1, http_version = 2L, ssl_enable_alpn = 0) timeout <- as.integer(getOption("timeout")) if(!is.na(timeout) && (timeout > 0L)) curl::handle_setopt(h, connecttimeout = timeout, timeout = timeout) if(startsWith(u, "https://github.com") && nzchar(a <- Sys.getenv("GITHUB_PAT", ""))) curl::handle_setheaders(h, "Authorization" = paste("token", a)) g <- tryCatch(curl::curl_fetch_memory(u, handle = h), error = identity) if(inherits(g, "error")) -1L else g$status_code }