diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ae1ecbc --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +.github +bin +.gitignore +.golangci.yaml +.goreleaser.yaml +CONTRIBUTING.md +LICENSE +README.md \ No newline at end of file diff --git a/.github/workflows/build-test.yml b/.github/workflows/build.yml similarity index 84% rename from .github/workflows/build-test.yml rename to .github/workflows/build.yml index b7e2b53..8447c01 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build.yml @@ -1,4 +1,4 @@ -name: 🔨 Build Test +name: 🔨 Build on: push: @@ -17,7 +17,7 @@ on: jobs: build: - name: Build Test + name: Build strategy: matrix: os: [ubuntu-latest, windows-latest, macOS-12] @@ -27,10 +27,10 @@ jobs: name: Set up Go uses: actions/setup-go@v4 with: - go-version: '>=1.20' + go-version: '>=1.23' - - name: Checkout the code - uses: actions/checkout@v3 + name: Checkout the repository + uses: actions/checkout@v4 with: fetch-depth: 0 - diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..5c29437 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,46 @@ +name: 🚨 Analyze Code (CodeQL) + +on: + push: + branches: + - "main" + paths: + - '**.go' + - '**.mod' + pull_request: + branches: + - "main" + paths: + - '**.go' + - '**.mod' + workflow_dispatch: + +jobs: + analyze: + name: Analyze Code (CodeQL) + strategy: + fail-fast: false + matrix: + language: [ 'go' ] + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + steps: + - + name: Checkout the repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - + name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + - + name: Autobuild + uses: github/codeql-action/autobuild@v3 + - + name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 \ No newline at end of file diff --git 
a/.github/workflows/dockerhub-push.yaml b/.github/workflows/dockerhub-push.yaml new file mode 100644 index 0000000..9ffe4ed --- /dev/null +++ b/.github/workflows/dockerhub-push.yaml @@ -0,0 +1,44 @@ +name: 🐳 DockerHub Push + +on: + workflow_run: + workflows: ["🎉 Release"] + types: + - completed + workflow_dispatch: + +jobs: + push: + name: DockerHub Push + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + attestations: write + id-token: write + steps: + - + name: Checkout + uses: actions/checkout@v4 + + - + name: Get Github tag + id: meta + run: | + curl --silent "https://api.github.com/repos/hueristiq/xcrawl3r/releases/latest" | jq -r .tag_name | xargs -I {} echo TAG={} >> $GITHUB_OUTPUT + + - + name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - + name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . + file: ./Dockerfile + push: true + tags: hueristiq/xcrawl3r:latest,hueristiq/xcrawl3r:${{ steps.meta.outputs.TAG }} diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint.yml similarity index 57% rename from .github/workflows/lint-test.yml rename to .github/workflows/lint.yml index 51d5f90..34d36e6 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint.yml @@ -1,4 +1,4 @@ -name: 💅 Lint Test +name: 💅 Lint on: push: @@ -14,27 +14,30 @@ on: - '**.go' - '**.mod' workflow_dispatch: - + permissions: contents: read - + jobs: lint: - name: Lint Test + name: Lint runs-on: ubuntu-latest steps: - name: Set up Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: - go-version: '>=1.20' + go-version: '>=1.23' + cache: false - - name: Checkout code - uses: actions/checkout@v3 + name: Checkout the repository + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Run golangci-lint - uses: golangci/golangci-lint-action@v3 + uses: golangci/golangci-lint-action@v6 with: 
- version: v1.52.2 + version: v1.61.0 + args: --timeout 5m + working-directory: . \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 21da946..5e642f1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,11 +1,11 @@ -name: 🎉 release +name: 🎉 Release on: - create: - branches: - - main + push: tags: - - v*.*.* + - 'v*.*.*' + - '*.*.*' + workflow_dispatch: jobs: release: @@ -16,20 +16,21 @@ jobs: name: Set up Go uses: actions/setup-go@v4 with: - go-version: '>=1.20' + go-version: '>=1.23' - - name: Checkout code - uses: actions/checkout@v3 + name: Checkout the repository + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Run GoReleaser - uses: goreleaser/goreleaser-action@v4 + uses: goreleaser/goreleaser-action@v5 with: - args: "release --clean" + distribution: goreleaser version: latest + args: "release --clean" + workdir: . env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - SLACK_WEBHOOK: "${{ secrets.SLACK_WEBHOOK }}" DISCORD_WEBHOOK_ID: "${{ secrets.DISCORD_WEBHOOK_ID }}" - DISCORD_WEBHOOK_TOKEN: "${{ secrets.DISCORD_WEBHOOK_TOKEN }}" + DISCORD_WEBHOOK_TOKEN: "${{ secrets.DISCORD_WEBHOOK_TOKEN }}" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 84bd03b..c5e82d7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1 @@ -# Executable - -cmd/xcrawl3r/xcrawl3r - -# Notes - -notes.txt \ No newline at end of file +bin \ No newline at end of file diff --git a/.golangci.yaml b/.golangci.yaml index b0b1ffd..0e6d4e6 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -1,61 +1,283 @@ +# Options for analysis running. run: + # Number of operating system threads (`GOMAXPROCS`) that can execute golangci-lint simultaneously. + # If it is explicitly set to 0 (i.e. not the default) then golangci-lint will automatically set the value to match Linux container CPU quota. 
+ # Default: the number of logical CPUs in the machine + # concurrency: 4 + # Timeout for analysis, e.g. 30s, 5m. + # Default: 1m + timeout: 5m + # Exit code when at least one issue was found. + # Default: 1 issues-exit-code: 1 + # Include test files or not. + # Default: true + tests: true + # List of build tags, all linters use it. + # Default: [] + build-tags: [] + # If set, we pass it to "go list -mod={option}". From "go help modules": + # If invoked with -mod=readonly, the go command is disallowed from the implicit + # automatic updating of go.mod described above. Instead, it fails when any changes + # to go.mod are needed. This setting is most useful to check that go.mod does + # not need updates, such as in a continuous integration and testing system. + # If invoked with -mod=vendor, the go command assumes that the vendor + # directory holds the correct copies of dependencies and ignores + # the dependency descriptions in go.mod. + # + # Allowed values: readonly|vendor|mod + # Default: "" + modules-download-mode: readonly + # Allow multiple parallel golangci-lint instances running. + # If false, golangci-lint acquires file lock on start. + # Default: false + allow-parallel-runners: true + # Allow multiple golangci-lint instances running, but serialize them around a lock. + # If false, golangci-lint exits with an error if it fails to acquire file lock on start. + # Default: false + allow-serial-runners: true + # Define the Go version limit. + # Mainly related to generics support since go1.18. + # Default: use Go version from the go.mod file, fallback on the env var `GOVERSION`, fallback on 1.17 + go: '1.23' + +# output configuration options +output: + # The formats used to render issues. 
+ # Formats: + # - `colored-line-number` + # - `line-number` + # - `json` + # - `colored-tab` + # - `tab` + # - `html` + # - `checkstyle` + # - `code-climate` + # - `junit-xml` + # - `junit-xml-extended` + # - `github-actions` + # - `teamcity` + # - `sarif` + # Output path can be `stdout`, `stderr`, or the path of a file to write to. + # + # For the CLI flag (`--out-format`), multiple formats can be specified by separating them with commas. + # The output can be specified for each of them by separating the format name and path with a colon. + # Example: "--out-format=checkstyle:report.xml,json:stdout,colored-line-number" + # The CLI flag (`--out-format`) overrides the configuration file. + # + # Default: + # formats: + # - format: colored-line-number + # path: stdout + formats: + # - + # format: json + # path: stderr + # - + # format: checkstyle + # path: report.xml + - + format: colored-line-number + path: stderr + # Print lines of code with issue. + # Default: true + print-issued-lines: true + # Print linter name at the end of the issue text. + # Default: true + print-linter-name: true + # Make issues output unique by line. + # Default: true + uniq-by-line: false + # Add a prefix to the output file references. + # Default: "" + path-prefix: "" + # Sort results by the order defined in `sort-order`. + # Default: false + sort-results: true + # Order to use when sorting results. + # Requires `sort-results` to be `true`. + # Possible values: `file`, `linter`, and `severity`. + # + # If the severity values are inside the following list, they are ordered in this order: + # 1. error + # 2. warning + # 3. high + # 4. medium + # 5. low + # Otherwise, they are sorted alphabetically. + # + # Default: ["file"] + sort-order: + - linter + - severity + - file # filepath, line, and column. + # Show statistics per linter. + # Default: false + show-stats: false linters: + # Disable all linters.
+ # Default: false disable-all: true + # Enable specific linter + # https://golangci-lint.run/usage/linters/#enabled-by-default enable: + - asasalint + - asciicheck + - bidichk - bodyclose - - depguard + - canonicalheader + - containedctx + - contextcheck + - copyloopvar + # - cyclop + - decorder + # - depguard - dogsled - dupl + - dupword + - durationcheck + - err113 - errcheck - - exportloopref + - errchkjson + - errname + - errorlint - exhaustive + # - exhaustruct + - fatcontext + - forbidigo + - forcetypeassert + # - funlen + - gci + - ginkgolinter + - gocheckcompilerdirectives + # - gochecknoglobals + # - gochecknoinits + - gochecksumtype + # - gocognit - goconst - gocritic + # - gocyclo + - godot + - godox - gofmt + - gofumpt + - goheader - goimports - - gocyclo + - gomoddirectives + - gomodguard + - goprintffuncname - gosec - gosimple + - gosmopolitan - govet + - grouper + - importas + - inamedparam - ineffassign + - interfacebloat + - intrange + - ireturn + # - lll + - loggercheck + - maintidx + - makezero + - mirror - misspell + # - mnd + - musttag + # - nakedret + - nestif + - nilerr + - nilnil + - nlreturn + - noctx - nolintlint + # - nonamedreturns + - nosprintfhostport + - paralleltest + # - perfsprint - prealloc - predeclared + - promlinter + - protogetter + - reassign - revive + - rowserrcheck + - sloglint + - spancheck + - sqlclosecheck - staticcheck - stylecheck + - tagalign + # - tagliatelle + - tenv + - testableexamples + - testifylint + - testpackage - thelper - tparallel - - typecheck - unconvert - unparam - unused + - usestdlibvars + # - varnamelen + - wastedassign - whitespace + - wrapcheck - wsl + - zerologlint linters-settings: - errcheck: - check-type-assertions: true goconst: min-len: 2 min-occurrences: 3 gocritic: enabled-tags: - - style - - diagnostic - performance - experimental + - style - opinionated disabled-checks: - captLocal - - octalLiteral - govet: - check-shadowing: true - enable: - - fieldalignment - nolintlint: - 
require-explanation: true - require-specific: true \ No newline at end of file + - whyNoLint + gocyclo: + # Minimal code complexity to report. + # Default: 30 (but we recommend 10-20) + min-complexity: 10 + # varnamelen: + # # The minimum length of a variable's name that is considered "long". + # # Variable names that are at least this long will be ignored. + # # Default: 3 + # min-name-length: 2 + # # Check method receivers. + # # Default: false + # check-receiver: true + # # Check named return values. + # # Default: false + # check-return: true + # # Check type parameters. + # # Default: false + # check-type-param: true + whitespace: + # Enforces newlines (or comments) after every multi-line if statement. + # Default: false + multi-if: true + # Enforces newlines (or comments) after every multi-line function signature. + # Default: false + multi-func: true + +issues: + # Which dirs to exclude: issues from them won't be reported. + # Regexps can be used here, e.g. `generated.*`; the regexp is applied to the full path, + # including the path prefix if one is set. + # Default dirs are skipped independently of this option's value (see exclude-dirs-use-default). + # "/" will be replaced by the current OS file path separator to work properly on Windows. + # Default: [] + exclude-dirs: [] + # Show issues in any part of updated files (requires new-from-rev or new-from-patch). + # Default: false + whole-files: false + # Fix found issues (if it's supported by the linter).
+ # Default: false + fix: true \ No newline at end of file diff --git a/.goreleaser.yaml b/.goreleaser.yaml index d15d87c..ca203b6 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -37,10 +37,9 @@ builds: archives: - id: tgz - builds: [xcrawl3r-cli] + builds: + - xcrawl3r-cli format: tar.gz - replacements: - darwin: macOS format_overrides: - goos: windows @@ -50,12 +49,6 @@ checksum: algorithm: sha256 announce: - slack: - enabled: true - channel: '#release' - username: GoReleaser - message_template: 'New Release: {{ .ProjectName }} {{.Tag}} is published! Check it out at {{ .ReleaseURL }}' - discord: enabled: true message_template: '**New Release: {{ .ProjectName }} {{.Tag}}** is published! Check it out at {{ .ReleaseURL }}' \ No newline at end of file diff --git a/.vscode/extenstions.json b/.vscode/extenstions.json new file mode 100644 index 0000000..7203cb3 --- /dev/null +++ b/.vscode/extenstions.json @@ -0,0 +1,5 @@ +{ + "recommendations": [ + "golang.go" + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1a653cd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "go.lintTool": "golangci-lint", + "go.lintFlags": [ + "--fast" + ] +} \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3027da0..e9a0b69 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,14 +41,14 @@ Pull requests should target the `dev` branch. Please also reference the issue fr When submitting code, please make every effort to follow existing conventions and style in order to keep the code as readable as possible. Here are a few points to keep in mind: -* Please run `go fmt ./...` before committing to ensure code aligns with go standards. -* We use [`golangci-lint`](https://golangci-lint.run/) for linting Go code, run `golangci-lint run --fix` before submitting PR. 
Editors such as Visual Studio Code or JetBrains IntelliJ; with Go support plugin will offer `golangci-lint` automatically. * All dependencies must be defined in the `go.mod` file. * Advanced IDEs and code editors (like VSCode) will take care of that, but to be sure, run `go mod tidy` to validate dependencies. +* Please run `go fmt ./...` before committing to ensure code aligns with Go standards. +* We use [`golangci-lint`](https://golangci-lint.run/) for linting Go code; run `golangci-lint run --fix` before submitting a PR. Editors such as Visual Studio Code or JetBrains IntelliJ, with a Go support plugin, will offer `golangci-lint` automatically. * For details on the approved style, check out [Effective Go](https://golang.org/doc/effective_go.html). ### License -By contributing your code, you agree to license your contribution under the terms of the [MIT License](./LICENSE). +By contributing your code, you agree to license your contribution under the terms of the [MIT License](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE). All files are released with the MIT license. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6a6c55f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,67 @@ +# Use the official Golang image version 1.23 with the Alpine distribution as the base image for the build stage. +# This multi-stage build starts with the "build-stage" stage where the Go application will be compiled. +FROM golang:1.23.1-alpine3.20 AS build-stage + +# Perform system updates and install necessary packages. +# - `apk --no-cache update`: Updates the Alpine package repository without caching index files. +# - `apk --no-cache upgrade`: Upgrades all installed packages to the latest available versions. +# - `apk --no-cache add`: Installs additional required packages: +# - `ca-certificates`: For managing CA certificates for secure communication. +# - `curl`: For making HTTP requests (can be used to download files or for health checks).
+# - `gcc` and `g++`: The GNU C and C++ compilers, needed only when building Go packages that use cgo. +# - `git`: Required for downloading Go modules that reference external repositories. +# - `make`: Utility for automating build processes and running the `Makefile`. +RUN <-linux-amd64.tar.gz ``` -> **TIP:** The above steps, download and extract, can be combined into a single step with this onliner +> [!TIP] +> The above steps, download and extract, can be combined into a single step with this one-liner: > > ```bash > curl -sL https://github.com/hueristiq/xcrawl3r/releases/download/v/xcrawl3r--linux-amd64.tar.gz | tar -xzv > ``` -**NOTE:** On Windows systems, you should be able to double-click the zip archive to extract the `xcrawl3r` executable. +> [!NOTE] +> On Windows systems, you should be able to double-click the zip archive to extract the `xcrawl3r` executable. ...move the `xcrawl3r` binary to somewhere in your `PATH`. For example, on GNU/Linux and OS X systems: @@ -64,7 +69,8 @@ tar xf xcrawl3r--linux-amd64.tar.gz sudo mv xcrawl3r /usr/local/bin/ ``` -**NOTE:** Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. +> [!NOTE] +> Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. ### Install source (With Go Installed) @@ -97,80 +103,105 @@ go install -v github.com/hueristiq/xcrawl3r/cmd/xcrawl3r@latest sudo mv xcrawl3r /usr/local/bin/ ``` - **NOTE:** Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`.
+ Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. -**NOTE:** While the development version is a good way to take a peek at `xcrawl3r`'s latest features before they get released, be aware that it may have bugs. Officially released versions will generally be more stable. +> [!CAUTION] +> While the development version is a good way to take a peek at `xcrawl3r`'s latest features before they get released, be aware that it may have bugs. Officially released versions will generally be more stable. + +### Install on Docker (With Docker Installed) + +To install `xcrawl3r` on docker: + +* Pull the docker image using: + + ```bash + docker pull hueristiq/xcrawl3r:latest + ``` + +* Run `xcrawl3r` using the image: + + ```bash + docker run --rm hueristiq/xcrawl3r:latest -h + ``` ## Usage -To display help message for `xcrawl3r` use the `-h` flag: +To start using `xcrawl3r`, open your terminal and run the following command for a list of options: ```bash xcrawl3r -h ``` -help message: +Here's what the help message looks like: ```text - _ _____ -__ _____ _ __ __ ___ _| |___ / _ __ -\ \/ / __| '__/ _` \ \ /\ / / | |_ \| '__| - > < (__| | | (_| |\ V V /| |___) | | -/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| v0.1.0 -A CLI utility to recursively crawl webpages. 
+ _ _____ +__ _____ _ __ __ ___ _| |___ / _ __ +\ \/ / __| '__/ _` \ \ /\ / / | |_ \| '__| + > < (__| | | (_| |\ V V /| |___) | | +/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| + v0.2.0 USAGE: xcrawl3r [OPTIONS] INPUT: - -d, --domain string domain to match URLs - --include-subdomains bool match subdomains' URLs - -s, --seeds string seed URLs file (use `-` to get from stdin) - -u, --url string URL to crawl + -d, --domain string domain to match URLs + --include-subdomains bool match subdomains' URLs + -s, --seeds string seed URLs file (use `-` to get from stdin) + -u, --url string URL to crawl CONFIGURATION: - --depth int maximum depth to crawl (default 3) - TIP: set it to `0` for infinite recursion - --headless bool If true the browser will be displayed while crawling. - -H, --headers string[] custom header to include in requests - e.g. -H 'Referer: http://example.com/' - TIP: use multiple flag to set multiple headers - --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080) - TIP: use multiple flag to set multiple proxies - --render bool utilize a headless chrome instance to render pages - --timeout int time to wait for request in seconds (default: 10) - --user-agent string User Agent to use (default: web) - TIP: use `web` for a random web user-agent, - `mobile` for a random mobile user-agent, - or you can set your specific user-agent. + --depth int maximum depth to crawl (default 3) + TIP: set it to `0` for infinite recursion + --headless bool If true the browser will be displayed while crawling. + -H, --headers string[] custom header to include in requests + e.g. 
-H 'Referer: http://example.com/' + TIP: use multiple flag to set multiple headers + --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080) + TIP: use multiple flag to set multiple proxies + --render bool utilize a headless chrome instance to render pages + --timeout int time to wait for request in seconds (default: 10) + --user-agent string User Agent to use (default: xcrawl3r v0.2.0 (https://github.com/hueristiq/xcrawl3r)) + TIP: use `web` for a random web user-agent, + `mobile` for a random mobile user-agent, + or you can set your specific user-agent. RATE LIMIT: - -c, --concurrency int number of concurrent fetchers to use (default 10) - --delay int delay between each request in seconds - --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s) - -p, --parallelism int number of concurrent URLs to process (default: 10) + -c, --concurrency int number of concurrent fetchers to use (default 10) + --delay int delay between each request in seconds + --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s) + -p, --parallelism int number of concurrent URLs to process (default: 10) OUTPUT: - --debug bool enable debug mode (default: false) - -m, --monochrome bool coloring: no colored output mode - -o, --output string output file to write found URLs - -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug) + --debug bool enable debug mode (default: false) + -m, --monochrome bool coloring: no colored output mode + -o, --output string output file to write found URLs + --silent bool display output URLs only + -v, --verbose bool display verbose output ``` ## Contributing -[Issues](https://github.com/hueristiq/xcrawl3r/issues) and [Pull Requests](https://github.com/hueristiq/xcrawl3r/pulls) are welcome! **Check out the [contribution guidelines](./CONTRIBUTING.md).** +We welcome contributions! 
Feel free to submit [Pull Requests](https://github.com/hueristiq/xcrawl3r/pulls) or report [Issues](https://github.com/hueristiq/xcrawl3r/issues). For more details, check out the [contribution guidelines](https://github.com/hueristiq/xcrawl3r/blob/master/CONTRIBUTING.md). ## Licensing -This utility is distributed under the [MIT license](./LICENSE). - +This utility is licensed under the [MIT license](https://opensource.org/license/mit). You are free to use, modify, and distribute it, as long as you follow the terms of the license. You can find the full license text in the repository - [Full MIT license text](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE). ## Credits -* Alternatives - Check out projects below, that may fit in your workflow: +### Contributors + +A huge thanks to all the contributors who have helped make `xcrawl3r` what it is today! + +[![contributors](https://contrib.rocks/image?repo=hueristiq/xcrawl3r&max=500)](https://github.com/hueristiq/xcrawl3r/graphs/contributors) + +### Similar Projects + +If you're interested in more utilities like this, check out: - [katana](https://github.com/projectdiscovery/katana) ◇ [gospider](https://github.com/jaeles-project/gospider) ◇ [hakrawler](https://github.com/hakluke/hakrawler) ◇ [urlgrab](https://github.com/IAmStoxe/urlgrab) \ No newline at end of file +[gospider](https://github.com/jaeles-project/gospider) ◇ [hakrawler](https://github.com/hakluke/hakrawler) ◇ [katana](https://github.com/projectdiscovery/katana) ◇ [urlgrab](https://github.com/IAmStoxe/urlgrab) \ No newline at end of file diff --git a/cmd/xcrawl3r/main.go b/cmd/xcrawl3r/main.go index 9be1003..368870d 100644 --- a/cmd/xcrawl3r/main.go +++ b/cmd/xcrawl3r/main.go @@ -6,11 +6,12 @@ import ( "io/fs" "os" "path/filepath" + "strings" - hqlog "github.com/hueristiq/hqgoutils/log" - "github.com/hueristiq/hqgoutils/log/formatter" - "github.com/hueristiq/hqgoutils/log/levels" - hqurl "github.com/hueristiq/hqgoutils/url" + hqgourl 
"github.com/hueristiq/hq-go-url" + "github.com/hueristiq/hqgolog" + "github.com/hueristiq/hqgolog/formatter" + "github.com/hueristiq/hqgolog/levels" "github.com/hueristiq/xcrawl3r/internal/configuration" "github.com/hueristiq/xcrawl3r/pkg/xcrawl3r" "github.com/logrusorgru/aurora/v3" @@ -41,7 +42,9 @@ var ( debug bool monochrome bool output string - verbosity string + + silent bool + verbose bool ) func init() { @@ -57,7 +60,7 @@ func init() { pflag.StringSliceVar(&proxies, "proxy", []string{}, "") pflag.BoolVar(&render, "render", false, "") pflag.IntVar(&timeout, "timeout", 10, "") - pflag.StringVar(&userAgent, "user-agent", "web", "") + pflag.StringVar(&userAgent, "user-agent", xcrawl3r.DefaultUserAgent, "") pflag.IntVarP(&concurrency, "concurrency", "c", 10, "") pflag.IntVar(&delay, "delay", 0, "") @@ -67,48 +70,51 @@ func init() { pflag.BoolVar(&debug, "debug", false, "") pflag.BoolVarP(&monochrome, "monochrome", "m", false, "") pflag.StringVarP(&output, "output", "o", "", "") - pflag.StringVarP(&verbosity, "verbosity", "v", string(levels.LevelInfo), "") + + pflag.BoolVar(&silent, "silent", false, "") + pflag.BoolVarP(&verbose, "verbose", "v", false, "") pflag.CommandLine.SortFlags = false pflag.Usage = func() { fmt.Fprintln(os.Stderr, configuration.BANNER) - h := "USAGE:\n" + h := "\nUSAGE:\n" h += " xcrawl3r [OPTIONS]\n" h += "\nINPUT:\n" - h += " -d, --domain string domain to match URLs\n" - h += " --include-subdomains bool match subdomains' URLs\n" - h += " -s, --seeds string seed URLs file (use `-` to get from stdin)\n" - h += " -u, --url string URL to crawl\n" + h += " -d, --domain string domain to match URLs\n" + h += " --include-subdomains bool match subdomains' URLs\n" + h += " -s, --seeds string seed URLs file (use `-` to get from stdin)\n" + h += " -u, --url string URL to crawl\n" h += "\nCONFIGURATION:\n" - h += " --depth int maximum depth to crawl (default 3)\n" - h += " TIP: set it to `0` for infinite recursion\n" - h += " --headless bool If true 
the browser will be displayed while crawling.\n" - h += " -H, --headers string[] custom header to include in requests\n" - h += " e.g. -H 'Referer: http://example.com/'\n" - h += " TIP: use multiple flag to set multiple headers\n" - h += " --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)\n" - h += " TIP: use multiple flag to set multiple proxies\n" - h += " --render bool utilize a headless chrome instance to render pages\n" - h += " --timeout int time to wait for request in seconds (default: 10)\n" - h += " --user-agent string User Agent to use (default: web)\n" - h += " TIP: use `web` for a random web user-agent,\n" - h += " `mobile` for a random mobile user-agent,\n" - h += " or you can set your specific user-agent.\n" + h += " --depth int maximum depth to crawl (default 3)\n" + h += " TIP: set it to `0` for infinite recursion\n" + h += " --headless bool If true the browser will be displayed while crawling.\n" + h += " -H, --headers string[] custom header to include in requests\n" + h += " e.g. 
-H 'Referer: http://example.com/'\n" + h += " TIP: use multiple flag to set multiple headers\n" + h += " --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)\n" + h += " TIP: use multiple flag to set multiple proxies\n" + h += " --render bool utilize a headless chrome instance to render pages\n" + h += " --timeout int time to wait for request in seconds (default: 10)\n" + h += fmt.Sprintf(" --user-agent string User Agent to use (default: %s)\n", xcrawl3r.DefaultUserAgent) + h += " TIP: use `web` for a random web user-agent,\n" + h += " `mobile` for a random mobile user-agent,\n" + h += " or you can set your specific user-agent.\n" h += "\nRATE LIMIT:\n" - h += " -c, --concurrency int number of concurrent fetchers to use (default 10)\n" - h += " --delay int delay between each request in seconds\n" - h += " --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s)\n" - h += " -p, --parallelism int number of concurrent URLs to process (default: 10)\n" + h += " -c, --concurrency int number of concurrent fetchers to use (default 10)\n" + h += " --delay int delay between each request in seconds\n" + h += " --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s)\n" + h += " -p, --parallelism int number of concurrent URLs to process (default: 10)\n" h += "\nOUTPUT:\n" - h += " --debug bool enable debug mode (default: false)\n" - h += " -m, --monochrome bool coloring: no colored output mode\n" - h += " -o, --output string output file to write found URLs\n" - h += " -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)\n" + h += " --debug bool enable debug mode (default: false)\n" + h += " -m, --monochrome bool coloring: no colored output mode\n" + h += " -o, --output string output file to write found URLs\n" + h += " --silent bool display output URLs only\n" + h += " -v, --verbose bool display verbose output\n" fmt.Fprint(os.Stderr, h) } @@ -116,8 +122,13 @@ func init() { 
pflag.Parse() // Initialize logger - hqlog.DefaultLogger.SetMaxLevel(levels.LevelStr(verbosity)) - hqlog.DefaultLogger.SetFormatter(formatter.NewCLI(&formatter.CLIOptions{ + hqgolog.DefaultLogger.SetMaxLevel(levels.LevelInfo) + + if verbose { + hqgolog.DefaultLogger.SetMaxLevel(levels.LevelDebug) + } + + hqgolog.DefaultLogger.SetFormatter(formatter.NewCLI(&formatter.CLIOptions{ Colorize: !monochrome, })) @@ -125,14 +136,18 @@ func init() { } func main() { - if verbosity != string(levels.LevelSilent) { + if !silent { fmt.Fprintln(os.Stderr, configuration.BANNER) } + hqgolog.Print().Msg("") + if seedsFile != "" && URL == "" && domain == "" { - hqlog.Fatal().Msg("using `-s, --seeds` requires either `-d, --domain` or `-u, --url` to be set!") + hqgolog.Fatal().Msg("using `-s, --seeds` requires either `-d, --domain` or `-u, --url` to be set!") } + up := hqgourl.NewParser() + // Load input URLs seeds := []string{} @@ -140,7 +155,13 @@ func main() { seeds = append(seeds, URL) if domain == "" { - domain = URL + parsed, err := up.Parse(URL) + if err != nil { + hqgolog.Fatal().Msg(err.Error()) + } + + domain = parsed.Domain.String() + domain = strings.TrimPrefix(domain, "www.") } } @@ -155,21 +176,21 @@ func main() { case seedsFile != "" && seedsFile == "-": stat, err = os.Stdin.Stat() if err != nil { - hqlog.Fatal().Msg("no stdin") + hqgolog.Fatal().Msg("no stdin") } if stat.Mode()&os.ModeNamedPipe == 0 { - hqlog.Fatal().Msg("no stdin") + hqgolog.Fatal().Msg("no stdin") } file = os.Stdin case seedsFile != "" && seedsFile != "-": file, err = os.Open(seedsFile) if err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } default: - hqlog.Fatal().Msg("xcrawl3r takes input from stdin or file using a flag") + hqgolog.Fatal().Msg("xcrawl3r takes input from stdin or file using a flag") } scanner := bufio.NewScanner(file) @@ -183,17 +204,12 @@ func main() { } if scanner.Err() != nil { - hqlog.Fatal().Msgf("%s", err) + hqgolog.Fatal().Msgf("%s", err) } } - 
parsedURL, err := hqurl.Parse(domain) - if err != nil { - hqlog.Fatal().Msgf("%s", err) - } - - options := &xcrawl3r.Options{ - Domain: parsedURL.Domain, + cfg := &xcrawl3r.Configuration{ + Domain: domain, IncludeSubdomains: includeSubdomains, Seeds: seeds, @@ -213,50 +229,53 @@ func main() { Debug: debug, } - crawler, err := xcrawl3r.New(options) + crawler, err := xcrawl3r.New(cfg) if err != nil { - hqlog.Fatal().Msgf("%s", err) + hqgolog.Fatal().Msg(err.Error()) } - URLs := crawler.Crawl() + var writer *bufio.Writer if output != "" { directory := filepath.Dir(output) if _, err := os.Stat(directory); os.IsNotExist(err) { if err = os.MkdirAll(directory, os.ModePerm); err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } } - file, err := os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + var file *os.File + + file, err = os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } defer file.Close() - writer := bufio.NewWriter(file) + writer = bufio.NewWriter(file) + } - for outputURL := range URLs { - if verbosity == string(levels.LevelSilent) { - hqlog.Print().Msg(outputURL.Value) + for URL := range crawler.Crawl() { + switch URL.Type { + case xcrawl3r.ResultError: + if verbose { + hqgolog.Error().Msgf("%s: %s\n", URL.Source, URL.Error) + } + case xcrawl3r.ResultURL: + if verbose { + hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(URL.Source), URL.Value) } else { - hqlog.Print().Msgf("[%s] %s", au.BrightBlue(outputURL.Source), outputURL.Value) + hqgolog.Print().Msg(URL.Value) } - fmt.Fprintln(writer, outputURL.Value) - } + if writer != nil { + fmt.Fprintln(writer, URL.Value) - if err = writer.Flush(); err != nil { - hqlog.Fatal().Msg(err.Error()) - } - } else { - for outputURL := range URLs { - if verbosity == string(levels.LevelSilent) { - hqlog.Print().Msg(outputURL.Value) - } else { - hqlog.Print().Msgf("[%s] %s", 
au.BrightBlue(outputURL.Source), outputURL.Value) + if err := writer.Flush(); err != nil { + hqgolog.Fatal().Msg(err.Error()) + } } } } diff --git a/go.mod b/go.mod index 9f3aad5..5771bd1 100644 --- a/go.mod +++ b/go.mod @@ -1,39 +1,43 @@ module github.com/hueristiq/xcrawl3r -go 1.20 +go 1.23.1 require ( - github.com/chromedp/chromedp v0.9.1 + github.com/chromedp/chromedp v0.11.0 github.com/gocolly/colly/v2 v2.1.0 - github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8 + github.com/hueristiq/hq-go-http v0.0.0-20241020113552-532feebd5687 + github.com/hueristiq/hq-go-url v0.0.0-20241020144539-a9e1f60005ea + github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f github.com/logrusorgru/aurora/v3 v3.0.0 - github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 github.com/spf13/pflag v1.0.5 ) require ( - github.com/PuerkitoBio/goquery v1.5.1 // indirect - github.com/andybalholm/cascadia v1.2.0 // indirect - github.com/antchfx/htmlquery v1.2.3 // indirect - github.com/antchfx/xmlquery v1.2.4 // indirect - github.com/antchfx/xpath v1.1.8 // indirect - github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9 // indirect - github.com/chromedp/sysutil v1.0.0 // indirect + github.com/Mzack9999/go-http-digest-auth-client v0.6.0 // indirect + github.com/PuerkitoBio/goquery v1.10.0 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/antchfx/htmlquery v1.3.3 // indirect + github.com/antchfx/xmlquery v1.4.2 // indirect + github.com/antchfx/xpath v1.3.2 // indirect + github.com/chromedp/cdproto v0.0.0-20241014181340-cb3a7a1d51d7 // indirect + github.com/chromedp/sysutil v1.1.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect - github.com/gobwas/ws v1.1.0 // indirect - github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect - github.com/golang/protobuf v1.4.2 // indirect + github.com/gobwas/ws v1.4.0 // 
indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/hueristiq/hq-go-retrier v0.0.0-20241020110813-ef8a550b01d5 // indirect + github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect github.com/mailru/easyjson v0.7.7 // indirect - github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect - github.com/temoto/robotstxt v1.1.1 // indirect - golang.org/x/net v0.10.0 // indirect - golang.org/x/sys v0.8.0 // indirect - golang.org/x/term v0.8.0 // indirect - golang.org/x/text v0.9.0 // indirect - google.golang.org/appengine v1.6.6 // indirect - google.golang.org/protobuf v1.24.0 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.30.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + golang.org/x/text v0.19.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/protobuf v1.35.1 // indirect ) diff --git a/go.sum b/go.sum index eb4465a..0839daa 100644 --- a/go.sum +++ b/go.sum @@ -1,24 +1,33 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= +github.com/Mzack9999/go-http-digest-auth-client v0.6.0 h1:LXVNMsj7qiNVmlZByFbjJmXf6SOm/uoo04XmnNcWPms= +github.com/Mzack9999/go-http-digest-auth-client v0.6.0/go.mod h1:gbwaYYXwA15ZfIxMyY5QU1acATDyNKEuG5TylBCL7AM= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= +github.com/PuerkitoBio/goquery 
v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= -github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= -github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= -github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= +github.com/antchfx/htmlquery v1.3.3 h1:x6tVzrRhVNfECDaVxnZi1mEGrQg3mjE/rxbH2Pe6dNE= +github.com/antchfx/htmlquery v1.3.3/go.mod h1:WeU3N7/rL6mb6dCwtE30dURBnBieKDC/fR8t6X+cKjU= github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= +github.com/antchfx/xmlquery v1.4.2 h1:MZKd9+wblwxfQ1zd1AdrTsqVaMjMCwow3IqkCSe00KA= +github.com/antchfx/xmlquery v1.4.2/go.mod h1:QXhvf5ldTuGqhd1SHNvvtlhhdQLks4dD0awIVhXIDTA= github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U= +github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9 h1:wMSvdj3BswqfQOXp2R1bJOAE7xIQLt2dlMQDMf836VY= -github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= 
-github.com/chromedp/chromedp v0.9.1 h1:CC7cC5p1BeLiiS2gfNNPwp3OaUxtRMBjfiw3E3k6dFA= -github.com/chromedp/chromedp v0.9.1/go.mod h1:DUgZWRvYoEfgi66CgZ/9Yv+psgi+Sksy5DTScENWjaQ= -github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= +github.com/chromedp/cdproto v0.0.0-20241003230502-a4a8f7c660df/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/cdproto v0.0.0-20241014181340-cb3a7a1d51d7 h1:VDBgUGgdCBw9lTKwp0KPExhnqmGfGVJQTER2MehoICk= +github.com/chromedp/cdproto v0.0.0-20241014181340-cb3a7a1d51d7/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/chromedp v0.11.0 h1:1PT6O4g39sBAFjlljIHTpxmCSk8meeYL6+R+oXH4bWA= +github.com/chromedp/chromedp v0.11.0/go.mod h1:jsD7OHrX0Qmskqb5Y4fn4jHnqquqW22rkMFgKbECsqg= github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= +github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM= +github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -31,14 +40,15 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.1.0 h1:7RFti/xnNkMJnrK7D1yQ/iCIB5OrrY/54/H930kIbHA= -github.com/gobwas/ws v1.1.0/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= +github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= +github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= 
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -49,15 +59,27 @@ github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrU github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= -github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod 
h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8 h1:Y5Hsbpr9c5oK2l/ktfWdPOLBOx9MPlH7vRQNK1mJmiQ= -github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8/go.mod h1:GxYwOCC1RrHbilApc+wccYiaABLlRnnYaHcobxNDHos= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/hueristiq/hq-go-http v0.0.0-20241020113552-532feebd5687 h1:wbtQCCbsyYpI22jE6f7MH979yNpvMPy0vertuYq32p0= +github.com/hueristiq/hq-go-http v0.0.0-20241020113552-532feebd5687/go.mod h1:4cIeUJTM4gt2NgJ4jOZePenVWaY8337wB1pvsK6sYDs= +github.com/hueristiq/hq-go-retrier v0.0.0-20241020110813-ef8a550b01d5 h1:uSIqfeqkXZI/QciepvLVduqbU7Rq+jr+At0ENVjPIN4= +github.com/hueristiq/hq-go-retrier v0.0.0-20241020110813-ef8a550b01d5/go.mod h1:YkxIHoJHsL0wmzQ3tc0qz4UTr9q9eCicUt5RvMV//xw= +github.com/hueristiq/hq-go-url v0.0.0-20241020144539-a9e1f60005ea h1:aFUvZ+Bnae4Coo97oThYy6OmuIwEkrQNGesMzAidedc= +github.com/hueristiq/hq-go-url v0.0.0-20241020144539-a9e1f60005ea/go.mod h1:1q7KVF3MOodsQzUkWwDwqn62L0Yjj8nLDSqZF0oirgQ= +github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f h1:JAgZOIJ+UbkENpRiOTlfg51CW0UNrUkgwLjUGiH+x9g= +github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f/go.mod h1:S5J3E3Azva5+JKv67uc+Hh3XwLDvkVYDGjEaMTFrIqg= +github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 h1:dpHAa9c74HgAXkZ2WPd84q2cCiF76eluuSGRw7bk7To= +github.com/hueristiq/hqgoutils 
v0.0.0-20231024005153-bd2c47932440/go.mod h1:NlZ117o///yWDbRAbgYD7/Y44qce8z1Dj4caUsjunSY= github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= @@ -71,27 +93,34 @@ github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= -github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 h1:2vmb32OdDhjZf2ETGDlr9n8RYXx7c+jXPxMiPbwnA+8= -github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4/go.mod h1:2JQx4jDHmWrbABvpOayg/+OTU6ehN0IyK2EHzceXpJo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= -github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod 
h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -99,41 +128,68 @@ golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= 
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201207223542-d4d67f95c62d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.8.0 h1:n5xxQn2i3PC0yLAbjTpNT85q/Kgzcr2gIoX9OrJUols= -golang.org/x/term 
v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools 
v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= @@ -148,7 +204,12 @@ google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzi google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go index 9ec9597..ada4d5d 100644 --- a/internal/configuration/configuration.go +++ b/internal/configuration/configuration.go @@ -3,23 +3,17 @@ package configuration import "github.com/logrusorgru/aurora/v3" const ( - NAME string = "xcrawl3r" - VERSION string = "0.1.0" - DESCRIPTION string = "A CLI utility to recursively crawl webpages." 
+ NAME = "xcrawl3r" + VERSION = "0.2.0" ) -var ( - BANNER = aurora.Sprintf( - aurora.BrightBlue(` +var BANNER = aurora.Sprintf( + aurora.BrightBlue(` _ _____ __ _____ _ __ __ ___ _| |___ / _ __ \ \/ / __| '__/ _`+"`"+` \ \ /\ / / | |_ \| '__| > < (__| | | (_| |\ V V /| |___) | | -/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| %s - -%s -`).Bold(), - aurora.BrightYellow("v"+VERSION).Bold(), - aurora.BrightGreen(DESCRIPTION).Italic(), - ) +/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| + %s`).Bold(), + aurora.BrightRed("v"+VERSION).Bold(), ) diff --git a/pkg/browser/browser.go b/pkg/browser/browser.go index 1037d10..395e590 100644 --- a/pkg/browser/browser.go +++ b/pkg/browser/browser.go @@ -5,29 +5,30 @@ import ( "log" "github.com/chromedp/chromedp" - hqlog "github.com/hueristiq/hqgoutils/log" + "github.com/hueristiq/hqgolog" ) -var GlobalContext context.Context -var GlobalCancel context.CancelFunc +var ( + GlobalContext context.Context + GlobalCancel context.CancelFunc +) func GetRenderedSource(url string) (outerHTML string) { // same browser, second tab newCtx, newCtxCancel := chromedp.NewContext(GlobalContext) + defer newCtxCancel() // ensure the second tab is created if err := chromedp.Run(newCtx); err != nil { newCtxCancel() - hqlog.Fatal().Msg(err.Error()) + + hqgolog.Fatal().Msg(err.Error()) } // navigate to a page, and get it's entire HTML - if err := chromedp.Run(newCtx, - chromedp.Navigate(url), - chromedp.OuterHTML("html", &outerHTML), - ); err != nil { - hqlog.Error().Msg(err.Error()) + if err := chromedp.Run(newCtx, chromedp.Navigate(url), chromedp.OuterHTML("html", &outerHTML)); err != nil { + hqgolog.Error().Msg(err.Error()) } return @@ -52,7 +53,7 @@ func GetGlobalContext(headless bool, proxy string) (ctx context.Context, cancel // ensure the first tab is created if err := chromedp.Run(ctx); err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } return diff --git a/pkg/parser/sitemap/sitemap.go b/pkg/parser/sitemap/sitemap.go new file mode 
100644 index 0000000..7f97af2 --- /dev/null +++ b/pkg/parser/sitemap/sitemap.go @@ -0,0 +1,154 @@ +package sitemap + +import ( + "encoding/xml" + "errors" + "io" +) + +type entry struct { + Type EntryType + Location string `xml:"loc"` + LastModified string `xml:"lastmod,omitempy"` + ChangeFrequency EntryChangeFrequency `xml:"changefreq,omitempty"` + Priority float32 `xml:"priority,omitempty"` +} + +func (e *entry) GetType() EntryType { + return e.Type +} + +func (e *entry) GetLocation() string { + return e.Location +} + +func (e *entry) GetChangeFrequency() EntryChangeFrequency { + return e.ChangeFrequency +} + +func (e *entry) GetPriority() float32 { + return e.Priority +} + +type EntryType string + +func (t EntryType) String() (entryType string) { + entryType = string(t) + + return +} + +type EntryChangeFrequency string + +func (f EntryChangeFrequency) String() (entryChangeFrequency string) { + entryChangeFrequency = string(f) + + return +} + +type Consumer func(entry Entry) (err error) + +type elementParser func(*xml.Decoder, *xml.StartElement) error + +type Entry interface { + GetType() EntryType + GetLocation() string + GetChangeFrequency() EntryChangeFrequency + GetPriority() float32 +} + +const ( + EntryTypeSitemap EntryType = "sitemap" + EntryTypeURL EntryType = "url" + + EntryChangeFrequencyAlways EntryChangeFrequency = "always" + EntryChangeFrequencyHourly EntryChangeFrequency = "hourly" + EntryChangeFrequencyDaily EntryChangeFrequency = "daily" + EntryChangeFrequencyWeekly EntryChangeFrequency = "weekly" + EntryChangeFrequencyMonthly EntryChangeFrequency = "monthly" + EntryChangeFrequencyYearly EntryChangeFrequency = "yearly" + EntryChangeFrequencyNever EntryChangeFrequency = "never" +) + +func Parse(reader io.Reader, consumer Consumer) (err error) { + return parseLoop(reader, func(d *xml.Decoder, se *xml.StartElement) (err error) { + return entryParser(d, se, consumer) + }) +} + +func entryParser(decoder *xml.Decoder, se *xml.StartElement, consume 
Consumer) (err error) { + if se.Name.Local == "url" { + entry := newURLEntry() + + if err = decoder.DecodeElement(entry, se); err != nil { + return + } + + if err = consume(entry); err != nil { + return + } + } + + if se.Name.Local == "sitemap" { + entry := newSitemapEntry() + + if err = decoder.DecodeElement(entry, se); err != nil { + return + } + + if err = consume(entry); err != nil { + return + } + } + + return +} + +func newURLEntry() (instance *entry) { + instance = &entry{ + Type: EntryTypeURL, + ChangeFrequency: EntryChangeFrequencyAlways, + Priority: 0.5, + } + + return +} + +func newSitemapEntry() (instance *entry) { + instance = &entry{ + Type: EntryTypeSitemap, + } + + return +} + +func parseLoop(reader io.Reader, parser elementParser) (err error) { + decoder := xml.NewDecoder(reader) + + for { + var token xml.Token + + token, err = decoder.Token() + + if errors.Is(err, io.EOF) { + err = nil + + break + } + + if err != nil { + return + } + + se, ok := token.(xml.StartElement) + if !ok { + continue + } + + if err = parser(decoder, &se); err != nil { + return + } + } + + return +} diff --git a/pkg/xcrawl3r/output.go b/pkg/xcrawl3r/output.go deleted file mode 100644 index 5b12081..0000000 --- a/pkg/xcrawl3r/output.go +++ /dev/null @@ -1,6 +0,0 @@ -package xcrawl3r - -type URL struct { - Source string - Value string -} diff --git a/pkg/xcrawl3r/page_strategy.go b/pkg/xcrawl3r/page_strategy.go index fa25a9d..a190b99 100644 --- a/pkg/xcrawl3r/page_strategy.go +++ b/pkg/xcrawl3r/page_strategy.go @@ -6,15 +6,15 @@ import ( "strings" "github.com/gocolly/colly/v2" - hqurl "github.com/hueristiq/hqgoutils/url" + hqgourl "github.com/hueristiq/hq-go-url" "github.com/hueristiq/xcrawl3r/pkg/browser" ) -func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { - URLsChannel = make(chan URL) +func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) <-chan Result { + results := make(chan Result) go func() { - defer close(URLsChannel) + defer 
close(results) if crawler.Render { // If we're using a proxy send it to the chrome instance @@ -26,6 +26,7 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { // If renderJavascript, pass the response's body to the renderer and then replace the body for .OnHTML to handle. crawler.PageCollector.OnResponse(func(request *colly.Response) { html := browser.GetRenderedSource(request.Request.URL.String()) + request.Body = []byte(html) }) } @@ -48,11 +49,21 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { } }) + crawler.FileCollector.OnError(func(_ *colly.Response, err error) { + result := Result{ + Type: ResultError, + Source: "page", + Error: err, + } + + results <- result + }) + crawler.PageCollector.OnHTML("[href]", func(e *colly.HTMLElement) { relativeURL := e.Attr("href") absoluteURL := e.Request.AbsoluteURL(relativeURL) - parsedAbsoluteURL, err := hqurl.Parse(absoluteURL) + parsedAbsoluteURL, err := up.Parse(absoluteURL) if err != nil { return } @@ -66,9 +77,23 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { return } - URLsChannel <- URL{Source: "page:href", Value: absoluteURL} + result := Result{ + Type: ResultURL, + Source: "page:href", + Value: absoluteURL, + } + + results <- result if err = e.Request.Visit(absoluteURL); err != nil { + result := Result{ + Type: ResultError, + Source: "page:href", + Error: err, + } + + results <- result + return } }) @@ -81,10 +106,24 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { return } - URLsChannel <- URL{Source: "page:src", Value: absoluteURL} + result := Result{ + Type: ResultURL, + Source: "page:src", + Value: absoluteURL, + } + + results <- result if match := crawler.FileURLsRegex.MatchString(absoluteURL); match { if err := crawler.FileCollector.Visit(absoluteURL); err != nil { + result := Result{ + Type: ResultError, + Source: "page:src", + Error: err, + } + + results <- result + 
return } @@ -92,6 +131,14 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { } if err := e.Request.Visit(absoluteURL); err != nil { + result := Result{ + Type: ResultError, + Source: "page:src", + Error: err, + } + + results <- result + return } }) @@ -112,9 +159,7 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { body := decode(string(response.Body)) URLs := crawler.URLsRegex.FindAllString(body, -1) - for index := range URLs { - fileURL := URLs[index] - + for _, fileURL := range URLs { // remove beginning and ending quotes fileURL = strings.Trim(fileURL, "\"") fileURL = strings.Trim(fileURL, "'") @@ -137,15 +182,37 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { continue } - URLsChannel <- URL{Source: "file:" + ext, Value: fileURL} + result := Result{ + Type: ResultURL, + Source: "file:" + ext, + Value: fileURL, + } + + results <- result if err := crawler.PageCollector.Visit(fileURL); err != nil { + result := Result{ + Type: ResultError, + Source: "file:" + ext, + Error: err, + } + + results <- result + return } } }) if err := crawler.PageCollector.Visit(parsedURL.String()); err != nil { + result := Result{ + Type: ResultError, + Source: "page", + Error: err, + } + + results <- result + return } @@ -153,5 +220,5 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { crawler.FileCollector.Wait() }() - return + return results } diff --git a/pkg/xcrawl3r/result.go b/pkg/xcrawl3r/result.go new file mode 100644 index 0000000..64f846d --- /dev/null +++ b/pkg/xcrawl3r/result.go @@ -0,0 +1,19 @@ +package xcrawl3r + +// Result represents the outcome of an operation or request, including the type of result, +// the source of the data, the actual value retrieved (if applicable), and any error encountered. +type Result struct { + Type ResultType // Specifies the type of result (e.g., a URL or an error). 
+ Source string // Indicates the source from which the result was obtained (e.g., a specific API or service). + Value string // Holds the value of the result, such as a URL or any other data returned from the operation. + Error error // Holds any error that occurred during the operation, or nil if no error occurred. +} + +// ResultType defines the type of result using an integer type. It can represent different +// kinds of outcomes from an operation, such as a URL or an error. +type ResultType int + +const ( + ResultURL ResultType = iota // Represents a successful result containing a URL. + ResultError // Represents a result where an error occurred during the operation. +) diff --git a/pkg/xcrawl3r/robots_strategy.go b/pkg/xcrawl3r/robots_strategy.go index bc8d8ef..c51a40a 100644 --- a/pkg/xcrawl3r/robots_strategy.go +++ b/pkg/xcrawl3r/robots_strategy.go @@ -1,56 +1,109 @@ package xcrawl3r import ( + "errors" "fmt" "io" - "net/http" "regexp" "strings" - hqurl "github.com/hueristiq/hqgoutils/url" + hqgohttp "github.com/hueristiq/hq-go-http" + "github.com/hueristiq/hq-go-http/status" + hqgourl "github.com/hueristiq/hq-go-url" ) -func (crawler *Crawler) robotsParsing(parsedURL *hqurl.URL) (URLsChannel chan URL) { - URLsChannel = make(chan URL) +func (crawler *Crawler) robotsParsing(parsedURL *hqgourl.URL) <-chan Result { + results := make(chan Result) go func() { - defer close(URLsChannel) + defer close(results) robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host) - res, err := http.Get(robotsURL) //nolint:gosec // Works! 
+ res, err := hqgohttp.Get(robotsURL) if err != nil { + result := Result{ + Type: ResultError, + Source: "known:robots", + Error: err, + } + + results <- result + return } defer res.Body.Close() - if res.StatusCode == 200 { - URLsChannel <- URL{Source: "known", Value: robotsURL} + if res.StatusCode != status.OK { + result := Result{ + Type: ResultError, + Source: "known:robots", + Error: errors.New("unexpected status code"), + } + + results <- result + + return + } + + result := Result{ + Type: ResultURL, + Source: "known:robots", + Value: robotsURL, + } + + results <- result - body, err := io.ReadAll(res.Body) - if err != nil { - return + body, err := io.ReadAll(res.Body) + if err != nil { + result := Result{ + Type: ResultError, + Source: "known:robots", + Error: err, } - lines := strings.Split(string(body), "\n") + results <- result - re := regexp.MustCompile(".*llow: ") + return + } + + lines := strings.Split(string(body), "\n") + + re := regexp.MustCompile(".*llow: ") + + for _, line := range lines { + if !strings.Contains(line, "llow: ") { + continue + } - for _, line := range lines { - if strings.Contains(line, "llow: ") { - rfURL := re.ReplaceAllString(line, "") - rfURL = fmt.Sprintf("%s://%s%s", parsedURL.Scheme, parsedURL.Host, rfURL) + rfURL := re.ReplaceAllString(line, "") - URLsChannel <- URL{Source: "robots", Value: rfURL} + rfURL = strings.ReplaceAll(rfURL, "*", "") + rfURL = strings.TrimPrefix(rfURL, "/") + rfURL = fmt.Sprintf("%s://%s/%s", parsedURL.Scheme, parsedURL.Host, rfURL) - if err = crawler.PageCollector.Visit(rfURL); err != nil { - continue - } + result := Result{ + Type: ResultURL, + Source: "robots", + Value: rfURL, + } + + results <- result + + if err = crawler.PageCollector.Visit(rfURL); err != nil { + result := Result{ + Type: ResultError, + Source: "robots", + Error: err, } + + results <- result + + continue } } }() - return + return results } diff --git a/pkg/xcrawl3r/sitemap_strategy.go b/pkg/xcrawl3r/sitemap_strategy.go index 
f1f6333..2853ed7 100644 --- a/pkg/xcrawl3r/sitemap_strategy.go +++ b/pkg/xcrawl3r/sitemap_strategy.go @@ -2,16 +2,18 @@ package xcrawl3r import ( "fmt" + "net/http" - hqurl "github.com/hueristiq/hqgoutils/url" - sitemap "github.com/oxffaa/gopher-parse-sitemap" + hqgohttp "github.com/hueristiq/hq-go-http" + hqgourl "github.com/hueristiq/hq-go-url" + sitemap "github.com/hueristiq/xcrawl3r/pkg/parser/sitemap" ) -func (crawler *Crawler) sitemapParsing(parsedURL *hqurl.URL) (URLsChannel chan URL) { - URLsChannel = make(chan URL) +func (crawler *Crawler) sitemapParsing(parsedURL *hqgourl.URL) <-chan Result { + results := make(chan Result) go func() { - defer close(URLsChannel) + defer close(results) sitemapPaths := []string{ "/sitemap.xml", @@ -31,24 +33,74 @@ func (crawler *Crawler) sitemapParsing(parsedURL *hqurl.URL) (URLsChannel chan U for _, path := range sitemapPaths { sitemapURL := fmt.Sprintf("%s://%s%s", parsedURL.Scheme, parsedURL.Host, path) - err := sitemap.ParseFromSite(sitemapURL, func(entry sitemap.Entry) (err error) { - smURL := entry.GetLocation() - - URLsChannel <- URL{Source: "sitemap", Value: smURL} - - if err = crawler.PageCollector.Visit(smURL); err != nil { - return + if err := crawler.parseSitemap(sitemapURL, results); err != nil { + result := Result{ + Type: ResultError, + Source: "known:sitemap", + Error: err, } - return - }) - if err != nil { + results <- result + continue } - URLsChannel <- URL{Source: "known", Value: sitemapURL} + result := Result{ + Type: ResultURL, + Source: "known:sitemap", + Value: sitemapURL, + } + + results <- result } }() + return results +} + +func (crawler *Crawler) parseSitemap(URL string, results chan Result) (err error) { + var res *http.Response + + res, err = hqgohttp.Get(URL) + if err != nil { + return + } + + if err = sitemap.Parse(res.Body, func(entry sitemap.Entry) (err error) { + sitemapEntryURL := entry.GetLocation() + + result := Result{ + Type: ResultURL, + Source: "sitemap", + Value: sitemapEntryURL, + 
} + + results <- result + + if entry.GetType() == sitemap.EntryTypeSitemap { + return crawler.parseSitemap(sitemapEntryURL, results) + } + + if err = crawler.PageCollector.Visit(sitemapEntryURL); err != nil { + result := Result{ + Type: ResultError, + Source: "sitemap", + Error: err, + } + + results <- result + + err = nil + + return + } + + return + }); err != nil { + return + } + + res.Body.Close() + return } diff --git a/pkg/xcrawl3r/utils.go b/pkg/xcrawl3r/utils.go index 5d013fe..5a4bfd6 100644 --- a/pkg/xcrawl3r/utils.go +++ b/pkg/xcrawl3r/utils.go @@ -3,7 +3,7 @@ package xcrawl3r import ( "strings" - hqurl "github.com/hueristiq/hqgoutils/url" + hqgourl "github.com/hueristiq/hq-go-url" ) func decode(source string) (decodedSource string) { @@ -17,10 +17,10 @@ func decode(source string) (decodedSource string) { return } -func (crawler *Crawler) fixURL(parsedURL *hqurl.URL, URL string) (fixedURL string) { +func (crawler *Crawler) fixURL(parsedURL *hqgourl.URL, URL string) (fixedURL string) { // decode // this .... - if strings.HasPrefix(URL, "http") { //nolint:gocritic // Works! 
+ if strings.HasPrefix(URL, "http") { // `http://google.com` OR `https://google.com` fixedURL = URL } else if strings.HasPrefix(URL, "//") { @@ -50,15 +50,19 @@ func (crawler *Crawler) fixURL(parsedURL *hqurl.URL, URL string) (fixedURL strin } func (crawler *Crawler) IsInScope(URL string) (isInScope bool) { - parsedURL, err := hqurl.Parse(URL) + parsedURL, err := up.Parse(URL) if err != nil { return } + if parsedURL.Domain == nil { + return + } + if crawler.IncludeSubdomains { - isInScope = parsedURL.Domain == crawler.Domain || strings.HasSuffix(parsedURL.Domain, "."+crawler.Domain) + isInScope = parsedURL.Domain.String() == crawler.Domain || strings.HasSuffix(parsedURL.Domain.String(), "."+crawler.Domain) } else { - isInScope = parsedURL.Domain == crawler.Domain || parsedURL.Domain == "www."+crawler.Domain + isInScope = parsedURL.Domain.String() == crawler.Domain || parsedURL.Domain.String() == "www."+crawler.Domain } return diff --git a/pkg/xcrawl3r/xcrawl3r.go b/pkg/xcrawl3r/xcrawl3r.go index f4f4d3f..0f7a344 100644 --- a/pkg/xcrawl3r/xcrawl3r.go +++ b/pkg/xcrawl3r/xcrawl3r.go @@ -14,109 +14,208 @@ import ( "github.com/gocolly/colly/v2/debug" "github.com/gocolly/colly/v2/extensions" "github.com/gocolly/colly/v2/proxy" - hqurl "github.com/hueristiq/hqgoutils/url" + hqgourl "github.com/hueristiq/hq-go-url" + "github.com/hueristiq/xcrawl3r/internal/configuration" ) -type Options struct { //nolint:govet // To be refactored +type Crawler struct { Domain string IncludeSubdomains bool Seeds []string - Depth int - Headless bool - Headers []string - Proxies []string - Render bool - Timeout int // seconds + Headless bool + Headers []string + Proxies []string + Render bool + // Timeout int UserAgent string - Concurrency int - Delay int // seconds - MaxRandomDelay int // seconds - Parallelism int + // Concurrency int + Delay int + // MaxRandomDelay int + Parallelism int Debug bool + + FileURLsRegex *regexp.Regexp + + URLsNotToRequestRegex *regexp.Regexp + URLsRegex 
*regexp.Regexp + + PageCollector *colly.Collector + FileCollector *colly.Collector +} + +func (crawler *Crawler) Crawl() (results chan Result) { + results = make(chan Result) + + go func() { + defer close(results) + + seedsChannel := make(chan string, crawler.Parallelism) + + go func() { + defer close(seedsChannel) + + for index := range crawler.Seeds { + seed := crawler.Seeds[index] + + seedsChannel <- seed + } + }() + + URLsWG := new(sync.WaitGroup) + + for range crawler.Parallelism { + URLsWG.Add(1) + + go func() { + defer URLsWG.Done() + + for seed := range seedsChannel { + parsedSeed, err := up.Parse(seed) + if err != nil { + continue + } + + seenURLs := &sync.Map{} + + wg := &sync.WaitGroup{} + + wg.Add(1) + + go func() { + defer wg.Done() + + for URL := range crawler.sitemapParsing(parsedSeed) { + _, loaded := seenURLs.LoadOrStore(URL.Value, struct{}{}) + if loaded { + continue + } + + results <- URL + } + }() + + wg.Add(1) + + go func() { + defer wg.Done() + + for URL := range crawler.robotsParsing(parsedSeed) { + _, loaded := seenURLs.LoadOrStore(URL, struct{}{}) + if loaded { + continue + } + + results <- URL + } + }() + + wg.Add(1) + + go func() { + defer wg.Done() + + for URL := range crawler.pageCrawl(parsedSeed) { + _, loaded := seenURLs.LoadOrStore(URL, struct{}{}) + if loaded { + continue + } + + results <- URL + } + }() + + wg.Wait() + } + }() + } + + URLsWG.Wait() + }() + + return } -type Crawler struct { //nolint:govet // To be refactored +type Configuration struct { + Depth int + Domain string IncludeSubdomains bool Seeds []string - Depth int Headless bool Headers []string Proxies []string Render bool - Timeout int + Timeout int // seconds UserAgent string Concurrency int - Delay int - MaxRandomDelay int + Delay int // seconds + MaxRandomDelay int // seconds Parallelism int Debug bool - - PageCollector *colly.Collector - FileURLsRegex *regexp.Regexp - FileCollector *colly.Collector - URLsNotToRequestRegex *regexp.Regexp - URLsRegex 
*regexp.Regexp } -func New(options *Options) (crawler *Crawler, err error) { +var ( + DefaultUserAgent = fmt.Sprintf("%s v%s (https://github.com/hueristiq/%s)", configuration.NAME, configuration.VERSION, configuration.NAME) + up = hqgourl.NewParser() +) + +func New(cfg *Configuration) (crawler *Crawler, err error) { crawler = &Crawler{ - Domain: options.Domain, - IncludeSubdomains: options.IncludeSubdomains, - Seeds: options.Seeds, - - Depth: options.Depth, - Headless: options.Headless, - Headers: options.Headers, - Proxies: options.Proxies, - Render: options.Render, - Timeout: options.Timeout, - UserAgent: options.UserAgent, - - Concurrency: options.Concurrency, - Delay: options.Delay, - MaxRandomDelay: options.MaxRandomDelay, - Parallelism: options.Parallelism, - - Debug: options.Debug, + Domain: cfg.Domain, + IncludeSubdomains: cfg.IncludeSubdomains, + Seeds: cfg.Seeds, + + Headless: cfg.Headless, + Headers: cfg.Headers, + Proxies: cfg.Proxies, + Render: cfg.Render, + // Timeout: cfg.Timeout, + UserAgent: cfg.UserAgent, + + // Concurrency: cfg.Concurrency, + Delay: cfg.Delay, + // MaxRandomDelay: cfg.MaxRandomDelay, + Parallelism: cfg.Parallelism, + + Debug: cfg.Debug, } - crawler.URLsRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`) //nolint:gocritic // Works fine! + crawler.URLsRegex = hqgourl.NewExtractor().CompileRegex() - crawler.FileURLsRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`) //nolint:gocritic // Works fine! 
+ crawler.FileURLsRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`) crawler.URLsNotToRequestRegex = regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`) crawler.PageCollector = colly.NewCollector( colly.Async(true), colly.IgnoreRobotsTxt(), - colly.MaxDepth(crawler.Depth), + colly.MaxDepth(cfg.Depth), colly.AllowedDomains(crawler.Domain, "www."+crawler.Domain), ) if crawler.IncludeSubdomains { - crawler.PageCollector.AllowedDomains = nil + crawler.PageCollector.AllowedDomains = []string{} - escapedDomain := regexp.QuoteMeta(crawler.Domain) - pattern := fmt.Sprintf(`https?://([a-z0-9.-]*\.)?%s(/[a-zA-Z0-9()/*\-+_~:,.?#=]*)?`, escapedDomain) + // pattern := fmt.Sprintf(`https?://([a-z0-9.-]*\.)?%s(/[a-zA-Z0-9()/*\-+_~:,.?#=]*)?`, regexp.QuoteMeta(crawler.Domain)) crawler.PageCollector.URLFilters = []*regexp.Regexp{ - regexp.MustCompile(pattern), + // regexp.MustCompile(pattern), + hqgourl.NewExtractor(hqgourl.ExtractorWithSchemePattern(`(?:https?)://`)).CompileRegex(), } } - crawler.PageCollector.SetRequestTimeout(time.Duration(crawler.Timeout) * time.Second) + crawler.PageCollector.SetRequestTimeout(time.Duration(cfg.Timeout) * time.Second) if err = crawler.PageCollector.Limit(&colly.LimitRule{ DomainGlob: "*", - Parallelism: crawler.Concurrency, - RandomDelay: time.Duration(crawler.MaxRandomDelay) * time.Second, + Parallelism: cfg.Concurrency, + RandomDelay: time.Duration(cfg.MaxRandomDelay) * time.Second, }); err != nil { return } @@ -132,7 +231,7 @@ func New(options *Options) (crawler *Crawler, err error) { var splitEntry []string - if strings.Contains(entry, ": ") { //nolint:gocritic // Works! 
+ if strings.Contains(entry, ": ") { splitEntry = strings.SplitN(entry, ": ", 2) } else if strings.Contains(entry, ":") { splitEntry = strings.SplitN(entry, ":", 2) @@ -150,26 +249,30 @@ func New(options *Options) (crawler *Crawler, err error) { extensions.Referer(crawler.PageCollector) - switch ua := strings.ToLower(crawler.UserAgent); { - case strings.HasPrefix(ua, "mob"): - extensions.RandomMobileUserAgent(crawler.PageCollector) - case strings.HasPrefix(ua, "web"): - extensions.RandomUserAgent(crawler.PageCollector) - default: - crawler.PageCollector.UserAgent = crawler.UserAgent + if crawler.UserAgent == "" { + crawler.PageCollector.UserAgent = DefaultUserAgent + } else { + switch ua := strings.ToLower(crawler.UserAgent); { + case strings.HasPrefix(ua, "mob"): + extensions.RandomMobileUserAgent(crawler.PageCollector) + case strings.HasPrefix(ua, "web"): + extensions.RandomUserAgent(crawler.PageCollector) + default: + crawler.PageCollector.UserAgent = crawler.UserAgent + } } HTTPTransport := &http.Transport{ DialContext: (&net.Dialer{ - Timeout: time.Duration(crawler.Timeout) * time.Second, - KeepAlive: time.Duration(crawler.Timeout) * time.Second, + Timeout: time.Duration(cfg.Timeout) * time.Second, + KeepAlive: time.Duration(cfg.Timeout) * time.Second, }).DialContext, MaxIdleConns: 100, // Golang default is 100 MaxConnsPerHost: 1000, - IdleConnTimeout: time.Duration(crawler.Timeout) * time.Second, - TLSHandshakeTimeout: time.Duration(crawler.Timeout) * time.Second, + IdleConnTimeout: time.Duration(cfg.Timeout) * time.Second, + TLSHandshakeTimeout: time.Duration(cfg.Timeout) * time.Second, TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, //nolint:gosec // Intended + InsecureSkipVerify: true, Renegotiation: tls.RenegotiateOnceAsClient, }, } @@ -179,21 +282,25 @@ func New(options *Options) (crawler *Crawler, err error) { CheckRedirect: func(req *http.Request, via []*http.Request) (err error) { nextLocation := req.Response.Header.Get("Location") - var 
parsedLocation *hqurl.URL + var parsedLocation *hqgourl.URL - parsedLocation, err = hqurl.Parse(nextLocation) + parsedLocation, err = up.Parse(nextLocation) if err != nil { - return err + return + } + + if parsedLocation.Domain == nil { + return } - if crawler.IncludeSubdomains && - (parsedLocation.Domain == crawler.Domain || - strings.HasSuffix(parsedLocation.Domain, "."+crawler.Domain)) { - return nil + fmt.Println(parsedLocation) + + if cfg.IncludeSubdomains && (parsedLocation.Domain.String() == cfg.Domain || strings.HasSuffix(parsedLocation.Domain.String(), "."+cfg.Domain)) { + return } - if parsedLocation.Domain == crawler.Domain || parsedLocation.Domain == "www."+crawler.Domain { - return nil + if parsedLocation.Domain.String() == cfg.Domain || parsedLocation.Domain.String() == "www."+cfg.Domain { + return } return http.ErrUseLastResponse @@ -206,9 +313,7 @@ func New(options *Options) (crawler *Crawler, err error) { // Proxies // NOTE: Must come AFTER .SetClient calls if len(crawler.Proxies) > 0 { - var ( - rrps colly.ProxyFunc - ) + var rrps colly.ProxyFunc rrps, err = proxy.RoundRobinProxySwitcher(crawler.Proxies...) 
if err != nil { @@ -226,94 +331,3 @@ func New(options *Options) (crawler *Crawler, err error) { return } - -func (crawler *Crawler) Crawl() (URLsChannel chan URL) { - URLsChannel = make(chan URL) - - go func() { - defer close(URLsChannel) - - seedsChannel := make(chan string, crawler.Parallelism) - - go func() { - defer close(seedsChannel) - - for index := range crawler.Seeds { - seed := crawler.Seeds[index] - - seedsChannel <- seed - } - }() - - URLsWG := new(sync.WaitGroup) - - for i := 0; i < crawler.Parallelism; i++ { - URLsWG.Add(1) - - go func() { - defer URLsWG.Done() - - for seed := range seedsChannel { - parsedSeed, err := hqurl.Parse(seed) - if err != nil { - continue - } - - wg := &sync.WaitGroup{} - seen := &sync.Map{} - - wg.Add(1) - - go func() { - defer wg.Done() - - for URL := range crawler.sitemapParsing(parsedSeed) { - _, loaded := seen.LoadOrStore(URL.Value, struct{}{}) - if loaded { - continue - } - - URLsChannel <- URL - } - }() - - wg.Add(1) - - go func() { - defer wg.Done() - - for URL := range crawler.robotsParsing(parsedSeed) { - _, loaded := seen.LoadOrStore(URL, struct{}{}) - if loaded { - continue - } - - URLsChannel <- URL - } - }() - - wg.Add(1) - - go func() { - defer wg.Done() - - for URL := range crawler.pageCrawl(parsedSeed) { - _, loaded := seen.LoadOrStore(URL, struct{}{}) - if loaded { - continue - } - - URLsChannel <- URL - } - }() - - wg.Wait() - } - }() - } - - URLsWG.Wait() - }() - - return -}