From 3ec4cd423eb80d91905f1105c48d2dd420f9ea41 Mon Sep 17 00:00:00 2001 From: Chao Xu Date: Sat, 9 Jan 2016 17:03:39 -0800 Subject: [PATCH] linkchecker tool now visits the URL to determine if it's valid --- Makefile | 1 - cmd/linkcheck/links.go | 147 ++++++++++++++++++++++----- docs/devel/how-to-doc.md | 4 - docs/proposals/node-allocatable.md | 4 +- hack/after-build/verify-linkcheck.sh | 39 +++++-- hack/verify-all.sh | 2 +- hooks/pre-commit | 10 -- shippable.yml | 1 - 8 files changed, 157 insertions(+), 51 deletions(-) diff --git a/Makefile b/Makefile index 61e5039350..361dc15715 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,6 @@ verify: hack/verify-generated-deep-copies.sh hack/verify-generated-docs.sh hack/verify-swagger-spec.sh - hack/verify-linkcheck.sh hack/verify-flags-underscore.py hack/verify-godeps.sh $(BRANCH) hack/verify-godep-licenses.sh $(BRANCH) diff --git a/cmd/linkcheck/links.go b/cmd/linkcheck/links.go index 638a593884..099277ca6d 100644 --- a/cmd/linkcheck/links.go +++ b/cmd/linkcheck/links.go @@ -14,56 +14,155 @@ See the License for the specific language governing permissions and limitations under the License. */ -// This command checks if the hyperlinks in files are valid. It checks the files -// with 'fileSuffix' in 'rootDir' for URLs that match 'prefix'. It trims the -// 'prefix' from the URL, uses what's left as the relative path to repoRoot to -// verify if the link is valid. For example: -// $ linkcheck --root-dir=${TYPEROOT} --repo-root=${KUBE_ROOT} \ -// --file-suffix=types.go --prefix=http://releases.k8s.io/HEAD +// This tool extracts the links from types.go and .md files, visits the link and +// checks the status code of the response. +// Usage: +// $ linkcheck --root-dir=${ROOT} package main import ( "fmt" "io/ioutil" + "net/http" "os" - "path" "path/filepath" "regexp" + "strconv" "strings" + "time" + "github.com/mvdan/xurls" flag "github.com/spf13/pflag" ) var ( - httpRE *regexp.Regexp - rootDir = flag.String("root-dir", "", "Root directory containing documents to be processed.") - repoRoot = flag.String("repo-root", "", `Root directory of k8s repository.`) - fileSuffix = flag.String("file-suffix", "", "suffix of files to be checked") - prefix = flag.String("prefix", "", "Longest common prefix of the link URL, e.g., http://release.k8s.io/HEAD/ for links in pkg/api/types.go") + fileSuffix = flag.StringSlice("file-suffix", []string{"types.go", ".md"}, "suffix of files to be checked") + // URLs matching the patterns in the regWhiteList won't be checked. Patterns + // of dummy URLs should be added to the list to avoid false alerts. Also, + // patterns of URLs that we don't care about can be added here to improve + // efficiency. + regWhiteList = []*regexp.Regexp{ + regexp.MustCompile(`https://kubernetes-site\.appspot\.com`), + // skip url that doesn't start with an English alphabet, e.g., URLs with IP addresses. + regexp.MustCompile(`https?://[^A-Za-z].*`), + regexp.MustCompile(`https?://localhost.*`), + } + // URLs listed in the fullURLWhiteList won't be checked. This separated from + // the RegWhiteList to improve efficiency. This list includes dummy URLs that + // are hard to be generalized by a regex, and URLs that will cause false alerts. + fullURLWhiteList = map[string]struct{}{ + "http://github.com/some/repo.git": {}, + // This URL returns 404 when visited by this tool, but it works fine if visited by a browser. + "http://stackoverflow.com/questions/ask?tags=kubernetes": {}, + "https://github.com/$YOUR_GITHUB_USERNAME/kubernetes.git": {}, + "https://github.com/$YOUR_GITHUB_USERNAME/kubernetes": {}, + "http://storage.googleapis.com/kubernetes-release/release/v${K8S_VERSION}/bin/darwin/amd64/kubectl": {}, + // It seems this server expects certain User-Agent value, it works fine with Chrome, but returns 404 if we issue a plain cURL to it. + "http://supervisord.org/": {}, + "http://kubernetes.io/vX.Y/docs": {}, + "http://kubernetes.io/vX.Y/docs/": {}, + "http://kubernetes.io/vX.Y/": {}, + } + + visitedURLs = map[string]struct{}{} + htmlpreviewReg = regexp.MustCompile(`https://htmlpreview\.github\.io/\?`) + httpOrhttpsReg = regexp.MustCompile(`https?.*`) ) -func newWalkFunc(invalidLink *bool) filepath.WalkFunc { +func newWalkFunc(invalidLink *bool, client *http.Client) filepath.WalkFunc { return func(filePath string, info os.FileInfo, err error) error { - if !strings.HasSuffix(info.Name(), *fileSuffix) { + hasSuffix := false + for _, suffix := range *fileSuffix { + hasSuffix = hasSuffix || strings.HasSuffix(info.Name(), suffix) + } + if !hasSuffix { return nil } + fileBytes, err := ioutil.ReadFile(filePath) if err != nil { return err } foundInvalid := false - matches := httpRE.FindAllSubmatch(fileBytes, -1) - for _, match := range matches { - // match[1] should look like docs/devel/api-conventions.md - if _, err := os.Stat(path.Join(*repoRoot, string(match[1]))); err != nil { - fmt.Fprintf(os.Stderr, "Link is not valid: %s\n", string(match[0])) + allURLs := xurls.Strict.FindAll(fileBytes, -1) + fmt.Fprintf(os.Stdout, "\nChecking file %s\n", filePath) + URL: + for _, URL := range allURLs { + // Don't check non http/https URL + if !httpOrhttpsReg.Match(URL) { + continue + } + for _, whiteURL := range regWhiteList { + if whiteURL.Match(URL) { + continue URL + } + } + if _, found := fullURLWhiteList[string(URL)]; found { + continue + } + // remove the htmlpreview Prefix + processedURL := htmlpreviewReg.ReplaceAll(URL, []byte{}) + + // check if we have visited the URL. + if _, found := visitedURLs[string(processedURL)]; found { + continue + } + visitedURLs[string(processedURL)] = struct{}{} + + retry := 0 + const maxRetry int = 3 + backoff := 100 + for retry < maxRetry { + fmt.Fprintf(os.Stdout, "Visiting %s\n", string(processedURL)) + // Use verb HEAD to increase efficiency. However, some servers + // do not handle HEAD well, so we need to try a GET to avoid + // false alert. + resp, err := client.Head(string(processedURL)) + // URLs with mock host or mock port will cause error. If we report + // the error here, people need to add the mock URL to the white + // list every time they add a mock URL, which will be a maintenance + // nightmare. Hence, we decide to only report 404 to catch the + // cases where host and port are legit, but path is not, which + // is the most common mistake in our docs. + if err != nil { + break + } + if resp.StatusCode == 429 { + retryAfter := resp.Header.Get("Retry-After") + if seconds, err := strconv.Atoi(retryAfter); err != nil { + backoff = seconds + 10 + } + fmt.Fprintf(os.Stderr, "Got %d visiting %s, retry after %d seconds.\n", resp.StatusCode, string(URL), backoff) + time.Sleep(time.Duration(backoff) * time.Second) + backoff *= 2 + retry++ + } else if resp.StatusCode == 404 { + // We only check for 404 error for now. 401, 403 errors are hard to handle. + + // We need to try a GET to avoid false alert. + resp, err = client.Get(string(processedURL)) + if err != nil { + break + } + if resp.StatusCode != 404 { + continue URL + } + + foundInvalid = true + fmt.Fprintf(os.Stderr, "Failed: in file %s, Got %d visiting %s\n", filePath, resp.StatusCode, string(URL)) + break + } else { + break + } + } + if retry == maxRetry { foundInvalid = true + fmt.Fprintf(os.Stderr, "Failed: in file %s, still got 429 visiting %s after %d retries\n", filePath, string(URL), maxRetry) } } if foundInvalid { - fmt.Fprintf(os.Stderr, "Found invalid links in %s\n", filePath) *invalidLink = true } return nil @@ -72,14 +171,16 @@ func newWalkFunc(invalidLink *bool) filepath.WalkFunc { func main() { flag.Parse() - httpRE = regexp.MustCompile(*prefix + `(.*\.md)`) - if *rootDir == "" || *repoRoot == "" || *prefix == "" { + if *rootDir == "" { flag.Usage() os.Exit(2) } + client := http.Client{ + Timeout: time.Duration(5 * time.Second), + } invalidLink := false - if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink)); err != nil { + if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink, &client)); err != nil { fmt.Fprintf(os.Stderr, "Fail: %v.\n", err) os.Exit(2) } diff --git a/docs/devel/how-to-doc.md b/docs/devel/how-to-doc.md index 7f1d30bab7..2c50861159 100644 --- a/docs/devel/how-to-doc.md +++ b/docs/devel/how-to-doc.md @@ -18,10 +18,6 @@ If you are using a released version of Kubernetes, you should refer to the docs that go with that version. - -The latest 1.1.x release of this document can be found -[here](http://releases.k8s.io/release-1.1/docs/devel/how-to-doc.md). - Documentation for other releases can be found at [releases.k8s.io](http://releases.k8s.io). diff --git a/docs/proposals/node-allocatable.md b/docs/proposals/node-allocatable.md index c915bb6a5b..a3520a41ad 100644 --- a/docs/proposals/node-allocatable.md +++ b/docs/proposals/node-allocatable.md @@ -65,7 +65,7 @@ reservation grows), or running multiple Kubelets on a single node. ![image](node-allocatable.png) 1. **Node Capacity** - Already provided as - [`NodeStatus.Capacity`](https://htmlpreview.github.io/?https://github.com/kubernetes/kubernetes/HEAD/docs/api-reference/v1/definitions.html#_v1_nodestatus), + [`NodeStatus.Capacity`](https://htmlpreview.github.io/?https://github.com/kubernetes/kubernetes/blob/HEAD/docs/api-reference/v1/definitions.html#_v1_nodestatus), this is total capacity read from the node instance, and assumed to be constant. 2. **System-Reserved** (proposed) - Compute resources reserved for processes which are not managed by Kubernetes. Currently this covers all the processes lumped together in the `/system` raw @@ -81,7 +81,7 @@ reservation grows), or running multiple Kubelets on a single node. #### Allocatable Add `Allocatable` (4) to -[`NodeStatus`](https://htmlpreview.github.io/?https://github.com/kubernetes/kubernetes/HEAD/docs/api-reference/v1/definitions.html#_v1_nodestatus): +[`NodeStatus`](https://htmlpreview.github.io/?https://github.com/kubernetes/kubernetes/blob/HEAD/docs/api-reference/v1/definitions.html#_v1_nodestatus): ``` type NodeStatus struct { diff --git a/hack/after-build/verify-linkcheck.sh b/hack/after-build/verify-linkcheck.sh index c3865bcd74..64fc5de09a 100755 --- a/hack/after-build/verify-linkcheck.sh +++ b/hack/after-build/verify-linkcheck.sh @@ -24,15 +24,36 @@ source "${KUBE_ROOT}/hack/lib/init.sh" kube::golang::setup_env linkcheck=$(kube::util::find-binary "linkcheck") -TYPEROOT="${KUBE_ROOT}/pkg/api/" -"${linkcheck}" "--root-dir=${TYPEROOT}" "--repo-root=${KUBE_ROOT}" "--file-suffix=types.go" "--prefix=http://releases.k8s.io/HEAD" && ret=0 || ret=$? -if [[ $ret -eq 1 ]]; then - echo "links in ${TYPEROOT} is out of date." - exit 1 -fi -if [[ $ret -gt 1 ]]; then - echo "Error running linkcheck" - exit 1 +kube::util::ensure-temp-dir +OUTPUT="${KUBE_TEMP}"/linkcheck-output +cleanup() { + rm -rf "${OUTPUT}" +} +trap "cleanup" EXIT SIGINT +mkdir -p "$OUTPUT" + +APIROOT="${KUBE_ROOT}/pkg/api/" +APISROOT="${KUBE_ROOT}/pkg/apis/" +DOCROOT="${KUBE_ROOT}/docs/" +ROOTS=($APIROOT $APISROOT $DOCROOT) +found_invalid=false +for root in "${ROOTS[@]}"; do + "${linkcheck}" "--root-dir=${root}" 2> >(tee -a "${OUTPUT}/error" >&2) && ret=0 || ret=$? + if [[ $ret -eq 1 ]]; then + echo "Failed: found invalid links in ${root}." + found_invalid=true + fi + if [[ $ret -gt 1 ]]; then + echo "Error running linkcheck" + exit 1 + fi +done + +if [ ${found_invalid} = true ]; then + echo "Summary of invalid links:" + cat ${OUTPUT}/error fi +trap "cleanup" EXIT SIGINT + # ex: ts=2 sw=2 et filetype=sh diff --git a/hack/verify-all.sh b/hack/verify-all.sh index 99f151311f..6017f65502 100755 --- a/hack/verify-all.sh +++ b/hack/verify-all.sh @@ -60,7 +60,7 @@ if $SILENT ; then fi # remove protobuf until it is part of direct generation -EXCLUDE="verify-godeps.sh verify-godep-licenses.sh verify-generated-protobuf.sh" +EXCLUDE="verify-godeps.sh verify-godep-licenses.sh verify-generated-protobuf.sh verify-linkcheck.sh" ret=0 for t in `ls $KUBE_ROOT/hack/verify-*.sh` diff --git a/hooks/pre-commit b/hooks/pre-commit index 327c895417..d4234f78f7 100755 --- a/hooks/pre-commit +++ b/hooks/pre-commit @@ -122,16 +122,6 @@ else fi echo "${reset}" -echo -ne "Checking for links in API descriptions... " -if ! hack/after-build/verify-linkcheck.sh > /dev/null; then - echo "${red}ERROR!" - echo "Some links in pkg/api/.*types.go are outdated. They require a manual fix." - exit_code=1 -else - echo "${green}OK" -fi -echo "${reset}" - echo -ne "Checking for docs that need updating... " if ! hack/after-build/verify-generated-docs.sh > /dev/null; then echo "${red}ERROR!" diff --git a/shippable.yml b/shippable.yml index 7f6d23cb5a..0c43b9810c 100644 --- a/shippable.yml +++ b/shippable.yml @@ -56,7 +56,6 @@ install: - ./hack/verify-generated-docs.sh - ./hack/verify-generated-swagger-docs.sh - ./hack/verify-swagger-spec.sh - - ./hack/verify-linkcheck.sh script: # Disable coverage collection on pull requests