From 6870c8e6608e7da003477c832a20c03bb92551c8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Sep 2023 14:22:48 +0000 Subject: [PATCH 01/13] chore(deps): bump actions/checkout from 3 to 4 Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/build-test.yml | 2 +- .github/workflows/lint-test.yml | 2 +- .github/workflows/release.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index b7e2b53..faa6222 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -30,7 +30,7 @@ jobs: go-version: '>=1.20' - name: Checkout the code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint-test.yml index 51d5f90..3c60d70 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint-test.yml @@ -30,7 +30,7 @@ jobs: go-version: '>=1.20' - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 21da946..9b61c92 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,7 +19,7 @@ jobs: go-version: '>=1.20' - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - From 0c606dbdb642e3fcf46760b3790d1c043e806959 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 14:32:41 +0000 Subject: [PATCH 02/13] chore(deps): bump goreleaser/goreleaser-action from 4 to 5 Bumps [goreleaser/goreleaser-action](https://github.com/goreleaser/goreleaser-action) from 4 to 5. - [Release notes](https://github.com/goreleaser/goreleaser-action/releases) - [Commits](https://github.com/goreleaser/goreleaser-action/compare/v4...v5) --- updated-dependencies: - dependency-name: goreleaser/goreleaser-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 21da946..b27d343 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: fetch-depth: 0 - name: Run GoReleaser - uses: goreleaser/goreleaser-action@v4 + uses: goreleaser/goreleaser-action@v5 with: args: "release --clean" version: latest From 98bffe54b5ebcda2c6ece4bcc0a4a3b77ea32390 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Oct 2023 14:43:52 +0000 Subject: [PATCH 03/13] chore(deps): bump github.com/chromedp/chromedp from 0.9.1 to 0.9.3 Bumps [github.com/chromedp/chromedp](https://github.com/chromedp/chromedp) from 0.9.1 to 0.9.3. - [Release notes](https://github.com/chromedp/chromedp/releases) - [Commits](https://github.com/chromedp/chromedp/compare/v0.9.1...v0.9.3) --- updated-dependencies: - dependency-name: github.com/chromedp/chromedp dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- go.mod | 6 +++--- go.sum | 13 ++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 9f3aad5..eac06c4 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/hueristiq/xcrawl3r go 1.20 require ( - github.com/chromedp/chromedp v0.9.1 + github.com/chromedp/chromedp v0.9.3 github.com/gocolly/colly/v2 v2.1.0 github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8 github.com/logrusorgru/aurora/v3 v3.0.0 @@ -17,12 +17,12 @@ require ( github.com/antchfx/htmlquery v1.2.3 // indirect github.com/antchfx/xmlquery v1.2.4 // indirect github.com/antchfx/xpath v1.1.8 // indirect - github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9 // indirect + github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 // indirect github.com/chromedp/sysutil v1.0.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect - github.com/gobwas/ws v1.1.0 // indirect + github.com/gobwas/ws v1.3.0 // indirect github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect github.com/golang/protobuf v1.4.2 // indirect github.com/josharian/intern v1.0.0 // indirect diff --git a/go.sum b/go.sum index eb4465a..27cc525 100644 --- a/go.sum +++ b/go.sum @@ -13,10 +13,10 @@ github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNY github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9 h1:wMSvdj3BswqfQOXp2R1bJOAE7xIQLt2dlMQDMf836VY= -github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/chromedp v0.9.1 
h1:CC7cC5p1BeLiiS2gfNNPwp3OaUxtRMBjfiw3E3k6dFA= -github.com/chromedp/chromedp v0.9.1/go.mod h1:DUgZWRvYoEfgi66CgZ/9Yv+psgi+Sksy5DTScENWjaQ= +github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 h1:2zipcnjfFdqAjOQa8otCCh0Lk1M7RBzciy3s80YAKHk= +github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/chromedp v0.9.3 h1:Wq58e0dZOdHsxaj9Owmfcf+ibtpYN1N0FWVbaxa/esg= +github.com/chromedp/chromedp v0.9.3/go.mod h1:NipeUkUcuzIdFbBP8eNNvl9upcceOfWzoJn6cRe4ksA= github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= @@ -31,8 +31,8 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.1.0 h1:7RFti/xnNkMJnrK7D1yQ/iCIB5OrrY/54/H930kIbHA= -github.com/gobwas/ws v1.1.0/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= +github.com/gobwas/ws v1.3.0 h1:sbeU3Y4Qzlb+MOzIe6mQGf7QR4Hkv6ZD0qhGkBFL2O0= +github.com/gobwas/ws v1.3.0/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= @@ -112,7 +112,6 @@ golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys 
v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201207223542-d4d67f95c62d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= From bc6fe83493d02c6bfc043852668297587ef1435a Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Mon, 27 Nov 2023 21:12:42 +0300 Subject: [PATCH 04/13] build: - --- .gitignore | 6 +----- Makefile | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 Makefile diff --git a/.gitignore b/.gitignore index 84bd03b..77a98d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,3 @@ # Executable -cmd/xcrawl3r/xcrawl3r - -# Notes - -notes.txt \ No newline at end of file +bin \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d5cb06c --- /dev/null +++ b/Makefile @@ -0,0 +1,52 @@ +# Go(Golang) Options +GOCMD=go +GOMOD=$(GOCMD) mod +GOGET=$(GOCMD) get +GOFMT=$(GOCMD) fmt +GOTEST=$(GOCMD) test +GOBUILD=$(GOCMD) build +GOINSTALL=$(GOCMD) install +GOFLAGS := -v +LDFLAGS := -s -w + +# Golangci Options +GOLANGCILINTCMD=golangci-lint +GOLANGCILINTRUN=$(GOLANGCILINTCMD) run + +ifneq ($(shell go env GOOS),darwin) +LDFLAGS := -extldflags "-static" +endif + +all: build + +.PHONY: tidy +tidy: + $(GOMOD) tidy + +.PHONY: update-deps +update-deps: + $(GOGET) -f -t -u ./... + $(GOGET) -f -u ./... + +.PHONY: _gofmt +_gofmt: + $(GOFMT) ./... + +.PHONY: _golangci-lint +_golangci-lint: + $(GOLANGCILINTRUN) $(GOLANGCILINT) ./... 
+ +.PHONY: lint +lint: _gofmt _golangci-lint + +.PHONY: test +test: + $(GOTEST) $(GOFLAGS) ./... + +.PHONY: build +build: + $(GOBUILD) $(GOFLAGS) -ldflags '$(LDFLAGS)' -o bin/xcrawl3r cmd/xcrawl3r/main.go + +.PHONY: install +install: + $(GOINSTALL) $(GOFLAGS) ./... From 4397a6b3bdcac7dc412c64ae245c7274419b113f Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Fri, 22 Dec 2023 08:53:04 +0300 Subject: [PATCH 05/13] chore(*): - --- .github/workflows/build-test.yml | 6 +- .github/workflows/codeql-analysis.yml | 46 ++++++ .github/workflows/lint-test.yml | 13 +- .github/workflows/release.yml | 21 +-- .golangci.yaml | 183 ++++++++++++++++++------ .goreleaser.yaml | 11 +- CONTRIBUTING.md | 6 +- Makefile | 46 +++--- README.md | 39 +++-- cmd/xcrawl3r/main.go | 102 ++++++------- go.mod | 42 +++--- go.sum | 121 ++++++++++++---- internal/configuration/configuration.go | 21 ++- pkg/browser/browser.go | 21 +-- pkg/xcrawl3r/page_strategy.go | 16 ++- pkg/xcrawl3r/robots_strategy.go | 11 +- pkg/xcrawl3r/sitemap_strategy.go | 4 +- pkg/xcrawl3r/utils.go | 8 +- pkg/xcrawl3r/xcrawl3r.go | 43 +++--- 19 files changed, 499 insertions(+), 261 deletions(-) create mode 100644 .github/workflows/codeql-analysis.yml diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index faa6222..5216c78 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -27,9 +27,9 @@ jobs: name: Set up Go uses: actions/setup-go@v4 with: - go-version: '>=1.20' + go-version: '>=1.21' - - name: Checkout the code + name: Checkout the repository uses: actions/checkout@v4 with: fetch-depth: 0 @@ -42,4 +42,4 @@ jobs: - name: Go build run: go build -v . 
- working-directory: ./cmd/xcrawl3r \ No newline at end of file + working-directory: ./cmd/xsubfind3r \ No newline at end of file diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 0000000..a978f59 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,46 @@ +name: 🚨 CodeQL Analysis + +on: + push: + branches: + - "main" + paths: + - '**.go' + - '**.mod' + pull_request: + branches: + - "main" + paths: + - '**.go' + - '**.mod' + workflow_dispatch: + +jobs: + analyze: + name: CodeQL Analysis + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'go' ] + + steps: + - + name: Checkout repository + uses: actions/checkout@v4 + - + name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + - + name: Autobuild + uses: github/codeql-action/autobuild@v2 + - + name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 \ No newline at end of file diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint-test.yml index 3c60d70..88f9cdc 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint-test.yml @@ -14,10 +14,10 @@ on: - '**.go' - '**.mod' workflow_dispatch: - + permissions: contents: read - + jobs: lint: name: Lint Test @@ -27,9 +27,10 @@ jobs: name: Set up Go uses: actions/setup-go@v4 with: - go-version: '>=1.20' + go-version: '>=1.21' + cache: false - - name: Checkout code + name: Checkout the repository uses: actions/checkout@v4 with: fetch-depth: 0 @@ -37,4 +38,6 @@ jobs: name: Run golangci-lint uses: golangci/golangci-lint-action@v3 with: - version: v1.52.2 + version: v1.54.2 + args: --timeout 5m + working-directory: . 
\ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index df7ce09..2334759 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,11 +1,11 @@ -name: 🎉 release +name: 🎉 Release on: - create: - branches: - - main + push: tags: - - v*.*.* + - 'v*.*.*' + - '*.*.*' + workflow_dispatch: jobs: release: @@ -16,9 +16,9 @@ jobs: name: Set up Go uses: actions/setup-go@v4 with: - go-version: '>=1.20' + go-version: '>=1.21' - - name: Checkout code + name: Checkout the repository uses: actions/checkout@v4 with: fetch-depth: 0 @@ -26,10 +26,11 @@ jobs: name: Run GoReleaser uses: goreleaser/goreleaser-action@v5 with: - args: "release --clean" + distribution: goreleaser version: latest + args: "release --clean" + workdir: . env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - SLACK_WEBHOOK: "${{ secrets.SLACK_WEBHOOK }}" DISCORD_WEBHOOK_ID: "${{ secrets.DISCORD_WEBHOOK_ID }}" - DISCORD_WEBHOOK_TOKEN: "${{ secrets.DISCORD_WEBHOOK_TOKEN }}" + DISCORD_WEBHOOK_TOKEN: "${{ secrets.DISCORD_WEBHOOK_TOKEN }}" \ No newline at end of file diff --git a/.golangci.yaml b/.golangci.yaml index b0b1ffd..2f183b5 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -1,61 +1,160 @@ run: - issues-exit-code: 1 + # Timeout for analysis, e.g. 30s, 5m. + # Default: 1m + timeout: 5m linters: + # Disable all linters. + # Default: false disable-all: true + # Enable specific linter enable: - - bodyclose - - depguard - - dogsled - - dupl - - errcheck - - exportloopref - - exhaustive - - goconst - - gocritic - - gofmt - - goimports - - gocyclo - - gosec - - gosimple - - govet - - ineffassign - - misspell - - nolintlint - - prealloc - - predeclared - - revive - - staticcheck - - stylecheck - - thelper - - tparallel - - typecheck - - unconvert - - unparam - - unused - - whitespace - - wsl + # Enabled by Default + - errcheck # errcheck is a program for checking for unchecked errors in Go code. 
These unchecked errors can be critical bugs in some cases [fast: false, auto-fix: false] + - gosimple # (megacheck) # Linter for Go source code that specializes in simplifying code [fast: false, auto-fix: false] + - govet # (vet, vetshadow) # Vet examines Go source code and reports suspicious constructs, such as Printf calls whose arguments do not align with the format string [fast: false, auto-fix: false] + - ineffassign # Detects when assignments to existing variables are not used [fast: true, auto-fix: false] + - staticcheck # (megacheck) # It's a set of rules from staticcheck. It's not the same thing as the staticcheck binary. The author of staticcheck doesn't support or approve the use of staticcheck as a library inside golangci-lint. [fast: false, auto-fix: false] + - unused # (megacheck) # Checks Go code for unused constants, variables, functions and types [fast: false, auto-fix: false] + # Disabled by Default + - asasalint # check for pass []any as any in variadic func(...any) [fast: false, auto-fix: false] + - asciicheck # Simple linter to check that your code does not contain non-ASCII identifiers [fast: true, auto-fix: false] + - bidichk # Checks for dangerous unicode character sequences [fast: true, auto-fix: false] + - bodyclose # checks whether HTTP response body is closed successfully [fast: false, auto-fix: false] + - containedctx # containedctx is a linter that detects struct contained context.Context field [fast: false, auto-fix: false] + - contextcheck # check whether the function uses a non-inherited context [fast: false, auto-fix: false] + # - cyclop # checks function and package cyclomatic complexity [fast: false, auto-fix: false] + # - deadcode # [deprecated] # Finds unused code [fast: false, auto-fix: false] + - decorder # check declaration order and count of types, constants, variables and functions [fast: true, auto-fix: false] + # - depguard # Go linter that checks if package imports are in a list of acceptable packages [fast: true, 
auto-fix: false] + - dogsled # Checks assignments with too many blank identifiers (e.g. x, _, _, _, := f()) [fast: true, auto-fix: false] + # - dupl # Tool for code clone detection [fast: true, auto-fix: false] + - dupword # checks for duplicate words in the source code [fast: true, auto-fix: true] + - durationcheck # check for two durations multiplied together [fast: false, auto-fix: false] + - errchkjson # Checks types passed to the json encoding functions. Reports unsupported types and optionally reports occasions, where the check for the returned error can be omitted. [fast: false, auto-fix: false] + - errname # Checks that sentinel errors are prefixed with the `Err` and error types are suffixed with the `Error`. [fast: false, auto-fix: false] + - errorlint # errorlint is a linter for that can be used to find code that will cause problems with the error wrapping scheme introduced in Go 1.13. [fast: false, auto-fix: false] + - execinquery # execinquery is a linter about query string checker in Query function which reads your Go src files and warning it finds [fast: false, auto-fix: false] + - exhaustive # check exhaustiveness of enum switch statements [fast: false, auto-fix: false] + # - exhaustivestruct # [deprecated] # Checks if all struct's fields are initialized [fast: false, auto-fix: false] + # - exhaustruct # Checks if all structure fields are initialized [fast: false, auto-fix: false] + - exportloopref # checks for pointers to enclosing loop variables [fast: false, auto-fix: false] + # - forbidigo # Forbids identifiers [fast: false, auto-fix: false] + - forcetypeassert # finds forced type assertions [fast: true, auto-fix: false] + # - funlen # Tool for detection of long functions [fast: true, auto-fix: false] + - gci # Gci controls Go package import order and makes it always deterministic. 
[fast: true, auto-fix: false] + - ginkgolinter # enforces standards of using ginkgo and gomega [fast: false, auto-fix: false] + - gocheckcompilerdirectives # Checks that go compiler directive comments (//go:) are valid. [fast: true, auto-fix: false] + # - gochecknoglobals # check that no global variables exist [fast: false, auto-fix: false] + # - gochecknoinits # Checks that no init functions are present in Go code [fast: true, auto-fix: false] + # - gocognit # Computes and checks the cognitive complexity of functions [fast: true, auto-fix: false] + - goconst # Finds repeated strings that could be replaced by a constant [fast: true, auto-fix: false] + - gocritic # Provides diagnostics that check for bugs, performance and style issues. [fast: false, auto-fix: false] + # - gocyclo # Computes and checks the cyclomatic complexity of functions [fast: true, auto-fix: false] + # - godot # Check if comments end in a period [fast: true, auto-fix: true] + # - godox # Tool for detection of FIXME, TODO and other comment keywords [fast: true, auto-fix: false] + # - goerr113 # Go linter to check the errors handling expressions [fast: false, auto-fix: false] + - gofmt # Gofmt checks whether code was gofmt-ed. By default this tool runs with -s option to check for code simplification [fast: true, auto-fix: true] + - gofumpt # Gofumpt checks whether code was gofumpt-ed. [fast: true, auto-fix: true] + - goheader # Checks is file header matches to pattern [fast: true, auto-fix: false] + - goimports # Check import statements are formatted according to the 'goimport' command. Reformat imports in autofix mode. [fast: true, auto-fix: true] + # - golint # [deprecated] # Golint differs from gofmt. Gofmt reformats Go source code, whereas golint prints out style mistakes [fast: false, auto-fix: false] + # - gomnd # An analyzer to detect magic numbers. [fast: true, auto-fix: false] + - gomoddirectives # Manage the use of 'replace', 'retract', and 'excludes' directives in go.mod. 
[fast: true, auto-fix: false] + - gomodguard # Allow and block list linter for direct Go module dependencies. This is different from depguard where there are different block types for example version constraints and module recommendations. [fast: true, auto-fix: false] + - goprintffuncname # Checks that printf-like functions are named with `f` at the end [fast: true, auto-fix: false] + # - gosec # (gas) # Inspects source code for security problems [fast: false, auto-fix: false] + - gosmopolitan # Report certain i18n/l10n anti-patterns in your Go codebase [fast: false, auto-fix: false] + - grouper # An analyzer to analyze expression groups. [fast: true, auto-fix: false] + # - ifshort # [deprecated] # Checks that your code uses short syntax for if-statements whenever possible [fast: true, auto-fix: false] + - importas # Enforces consistent import aliases [fast: false, auto-fix: false] + - interfacebloat # A linter that checks the number of methods inside an interface. [fast: true, auto-fix: false] + # - interfacer # [deprecated] # Linter that suggests narrower interface types [fast: false, auto-fix: false] + # - ireturn # Accept Interfaces, Return Concrete Types [fast: false, auto-fix: false] + # - lll # Reports long lines [fast: true, auto-fix: false] + - loggercheck # (logrlint) # Checks key value pairs for common logger libraries (kitlog,klog,logr,zap). [fast: false, auto-fix: false] + - maintidx # maintidx measures the maintainability index of each function. 
[fast: true, auto-fix: false] + - makezero # Finds slice declarations with non-zero initial length [fast: false, auto-fix: false] + # - maligned # [deprecated] # Tool to detect Go structs that would take less memory if their fields were sorted [fast: false, auto-fix: false] + - mirror # reports wrong mirror patterns of bytes/strings usage [fast: false, auto-fix: false] + - misspell # Finds commonly misspelled English words in comments [fast: true, auto-fix: true] + - musttag # enforce field tags in (un)marshaled structs [fast: false, auto-fix: false] + # - nakedret # Finds naked returns in functions greater than a specified function length [fast: true, auto-fix: false] + - nestif # Reports deeply nested if statements [fast: true, auto-fix: false] + - nilerr # Finds the code that returns nil even if it checks that the error is not nil. [fast: false, auto-fix: false] + - nilnil # Checks that there is no simultaneous return of `nil` error and an invalid value. [fast: false, auto-fix: false] + - nlreturn # nlreturn checks for a new line before return and branch statements to increase code clarity [fast: true, auto-fix: false] + - noctx # noctx finds sending http request without context.Context [fast: false, auto-fix: false] + - nolintlint # Reports ill-formed or insufficient nolint directives [fast: true, auto-fix: false] + # - nonamedreturns # Reports all named returns [fast: false, auto-fix: false] + # - nosnakecase # [deprecated] # nosnakecase is a linter that detects snake case of variable naming and function name. [fast: true, auto-fix: false] + - nosprintfhostport # Checks for misuse of Sprintf to construct a host with port in a URL. 
[fast: true, auto-fix: false] + - paralleltest # paralleltest detects missing usage of t.Parallel() method in your Go test [fast: false, auto-fix: false] + - prealloc # Finds slice declarations that could potentially be pre-allocated [fast: true, auto-fix: false] + - predeclared # find code that shadows one of Go's predeclared identifiers [fast: true, auto-fix: false] + # - promlinter # Check Prometheus metrics naming via promlint [fast: true, auto-fix: false] + - reassign # Checks that package variables are not reassigned [fast: false, auto-fix: false] + - revive # Fast, configurable, extensible, flexible, and beautiful linter for Go. Drop-in replacement of golint. [fast: false, auto-fix: false] + - rowserrcheck # checks whether Err of rows is checked successfully [fast: false, auto-fix: false] + # - scopelint # [deprecated] # Scopelint checks for unpinned variables in go programs [fast: true, auto-fix: false] + - sqlclosecheck # Checks that sql.Rows and sql.Stmt are closed. [fast: false, auto-fix: false] + # - structcheck # [deprecated] # Finds unused struct fields [fast: false, auto-fix: false] + - stylecheck # Stylecheck is a replacement for golint [fast: false, auto-fix: false] + - tagalign # check that struct tags are well aligned [fast: true, auto-fix: true] + # - tagliatelle # Checks the struct tags. 
[fast: true, auto-fix: false] + - tenv # tenv is analyzer that detects using os.Setenv instead of t.Setenv since Go1.17 [fast: false, auto-fix: false] + - testableexamples # linter checks if examples are testable (have an expected output) [fast: true, auto-fix: false] + - testpackage # linter that makes you use a separate _test package [fast: true, auto-fix: false] + - thelper # thelper detects Go test helpers without t.Helper() call and checks the consistency of test helpers [fast: false, auto-fix: false] + - tparallel # tparallel detects inappropriate usage of t.Parallel() method in your Go test codes [fast: false, auto-fix: false] + - unconvert # Remove unnecessary type conversions [fast: false, auto-fix: false] + - unparam # Reports unused function parameters [fast: false, auto-fix: false] + - usestdlibvars # A linter that detect the possibility to use variables/constants from the Go standard library. [fast: true, auto-fix: false] + # - varcheck # [deprecated] # Finds unused global variables and constants [fast: false, auto-fix: false] + # - varnamelen # checks that the length of a variable's name matches its scope [fast: false, auto-fix: false] + - wastedassign # wastedassign finds wasted assignment statements. [fast: false, auto-fix: false] + - whitespace # Tool for detection of leading and trailing whitespace [fast: true, auto-fix: true] + # - wrapcheck # Checks that errors returned from external packages are wrapped [fast: false, auto-fix: false] + - wsl # Whitespace Linter - Forces you to use empty lines! [fast: true, auto-fix: false] + - zerologlint # Detects the wrong usage of `zerolog` that a user forgets to dispatch with `Send` or `Msg`. 
[fast: false, auto-fix: false] linters-settings: - errcheck: - check-type-assertions: true goconst: min-len: 2 min-occurrences: 3 gocritic: enabled-tags: - - style - - diagnostic - performance - experimental + - style - opinionated disabled-checks: - captLocal - - octalLiteral + - whyNoLint + gocyclo: + # Minimal code complexity to report. + # Default: 30 (but we recommend 10-20) + min-complexity: 10 govet: check-shadowing: true - enable: - - fieldalignment - nolintlint: - require-explanation: true - require-specific: true \ No newline at end of file + varnamelen: + # The minimum length of a variable's name that is considered "long". + # Variable names that are at least this long will be ignored. + # Default: 3 + min-name-length: 2 + # Check method receivers. + # Default: false + check-receiver: true + # Check named return values. + # Default: false + check-return: true + # Check type parameters. + # Default: false + check-type-param: true + whitespace: + # Enforces newlines (or comments) after every multi-line if statement. + # Default: false + multi-if: true + # Enforces newlines (or comments) after every multi-line function signature. + # Default: false + multi-func: true \ No newline at end of file diff --git a/.goreleaser.yaml b/.goreleaser.yaml index d15d87c..ca203b6 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -37,10 +37,9 @@ builds: archives: - id: tgz - builds: [xcrawl3r-cli] + builds: + - xcrawl3r-cli format: tar.gz - replacements: - darwin: macOS format_overrides: - goos: windows @@ -50,12 +49,6 @@ checksum: algorithm: sha256 announce: - slack: - enabled: true - channel: '#release' - username: GoReleaser - message_template: 'New Release: {{ .ProjectName }} {{.Tag}} is published! Check it out at {{ .ReleaseURL }}' - discord: enabled: true message_template: '**New Release: {{ .ProjectName }} {{.Tag}}** is published! 
Check it out at {{ .ReleaseURL }}' \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3027da0..e9a0b69 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,14 +41,14 @@ Pull requests should target the `dev` branch. Please also reference the issue fr When submitting code, please make every effort to follow existing conventions and style in order to keep the code as readable as possible. Here are a few points to keep in mind: -* Please run `go fmt ./...` before committing to ensure code aligns with go standards. -* We use [`golangci-lint`](https://golangci-lint.run/) for linting Go code, run `golangci-lint run --fix` before submitting PR. Editors such as Visual Studio Code or JetBrains IntelliJ; with Go support plugin will offer `golangci-lint` automatically. * All dependencies must be defined in the `go.mod` file. * Advanced IDEs and code editors (like VSCode) will take care of that, but to be sure, run `go mod tidy` to validate dependencies. +* Please run `go fmt ./...` before committing to ensure code aligns with go standards. +* We use [`golangci-lint`](https://golangci-lint.run/) for linting Go code, run `golangci-lint run --fix` before submitting PR. Editors such as Visual Studio Code or JetBrains IntelliJ; with Go support plugin will offer `golangci-lint` automatically. * For details on the approved style, check out [Effective Go](https://golang.org/doc/effective_go.html). ### License -By contributing your code, you agree to license your contribution under the terms of the [MIT License](./LICENSE). +By contributing your code, you agree to license your contribution under the terms of the [MIT License](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE). All files are released with the MIT license. 
diff --git a/Makefile b/Makefile index d5cb06c..4276669 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,8 @@ -# Go(Golang) Options +SHELL = /bin/bash + +all: go-build + +# --- Go(Golang) ------------------------------------------------------------------------------------ GOCMD=go GOMOD=$(GOCMD) mod GOGET=$(GOCMD) get @@ -9,44 +13,38 @@ GOINSTALL=$(GOCMD) install GOFLAGS := -v LDFLAGS := -s -w -# Golangci Options -GOLANGCILINTCMD=golangci-lint -GOLANGCILINTRUN=$(GOLANGCILINTCMD) run - ifneq ($(shell go env GOOS),darwin) LDFLAGS := -extldflags "-static" endif -all: build +GOLANGCILINTCMD=golangci-lint +GOLANGCILINTRUN=$(GOLANGCILINTCMD) run -.PHONY: tidy -tidy: +.PHONY: go-mod-tidy +go-mod-tidy: $(GOMOD) tidy -.PHONY: update-deps -update-deps: +.PHONY: go-mod-update +go-mod-update: $(GOGET) -f -t -u ./... $(GOGET) -f -u ./... -.PHONY: _gofmt -_gofmt: +.PHONY: go-fmt +go-fmt: $(GOFMT) ./... -.PHONY: _golangci-lint -_golangci-lint: +.PHONY: go-lint +go-lint: go-fmt $(GOLANGCILINTRUN) $(GOLANGCILINT) ./... -.PHONY: lint -lint: _gofmt _golangci-lint - -.PHONY: test -test: +.PHONY: go-test +go-test: $(GOTEST) $(GOFLAGS) ./... -.PHONY: build -build: +.PHONY: go-build +go-build: $(GOBUILD) $(GOFLAGS) -ldflags '$(LDFLAGS)' -o bin/xcrawl3r cmd/xcrawl3r/main.go -.PHONY: install -install: - $(GOINSTALL) $(GOFLAGS) ./... +.PHONY: go-install +go-install: + $(GOINSTALL) $(GOFLAGS) ./... 
\ No newline at end of file diff --git a/README.md b/README.md index 2456159..827896a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # xcrawl3r -![made with go](https://img.shields.io/badge/made%20with-Go-0000FF.svg) [![release](https://img.shields.io/github/release/hueristiq/xcrawl3r?style=flat&color=0000FF)](https://github.com/hueristiq/xcrawl3r/releases) [![license](https://img.shields.io/badge/license-MIT-gray.svg?color=0000FF)](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE) ![maintenance](https://img.shields.io/badge/maintained%3F-yes-0000FF.svg) [![open issues](https://img.shields.io/github/issues-raw/hueristiq/xcrawl3r.svg?style=flat&color=0000FF)](https://github.com/hueristiq/xcrawl3r/issues?q=is:issue+is:open) [![closed issues](https://img.shields.io/github/issues-closed-raw/hueristiq/xcrawl3r.svg?style=flat&color=0000FF)](https://github.com/hueristiq/xcrawl3r/issues?q=is:issue+is:closed) [![contribution](https://img.shields.io/badge/contributions-welcome-0000FF.svg)](https://github.com/hueristiq/xcrawl3r/blob/master/CONTRIBUTING.md) +![made with go](https://img.shields.io/badge/made%20with-Go-1E90FF.svg) [![go report card](https://goreportcard.com/badge/github.com/hueristiq/xcrawl3r)](https://goreportcard.com/report/github.com/hueristiq/xcrawl3r) [![release](https://img.shields.io/github/release/hueristiq/xcrawl3r?style=flat&color=1E90FF)](https://github.com/hueristiq/xcrawl3r/releases) [![open issues](https://img.shields.io/github/issues-raw/hueristiq/xcrawl3r.svg?style=flat&color=1E90FF)](https://github.com/hueristiq/xcrawl3r/issues?q=is:issue+is:open) [![closed issues](https://img.shields.io/github/issues-closed-raw/hueristiq/xcrawl3r.svg?style=flat&color=1E90FF)](https://github.com/hueristiq/xcrawl3r/issues?q=is:issue+is:closed) [![license](https://img.shields.io/badge/license-MIT-gray.svg?color=1E90FF)](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE) 
![maintenance](https://img.shields.io/badge/maintained%3F-yes-1E90FF.svg) [![contribution](https://img.shields.io/badge/contributions-welcome-1E90FF.svg)](https://github.com/hueristiq/xcrawl3r/blob/master/CONTRIBUTING.md) -`xcrawl3r` is a command-line interface (CLI) utility to recursively crawl webpages i.e systematically browse webpages' URLs and follow links to discover linked webpages' URLs. +`xcrawl3r` is a command-line interface (CLI) based utility to recursively crawl webpages. It is designed to systematically browse webpages' URLs and follow links to discover linked webpages' URLs. ## Resources @@ -16,6 +16,8 @@ * [Contributing](#contributing) * [Licensing](#licensing) * [Credits](#credits) + * [Contributors](#contributors) + * [Similar Projects](#similar-projects) ## Features @@ -50,13 +52,15 @@ Visit the [releases page](https://github.com/hueristiq/xcrawl3r/releases) and fi tar xf xcrawl3r--linux-amd64.tar.gz ``` -> **TIP:** The above steps, download and extract, can be combined into a single step with this onliner +> [!TIP] +> The above steps, download and extract, can be combined into a single step with this onliner > > ```bash > curl -sL https://github.com/hueristiq/xcrawl3r/releases/download/v/xcrawl3r--linux-amd64.tar.gz | tar -xzv > ``` -**NOTE:** On Windows systems, you should be able to double-click the zip archive to extract the `xcrawl3r` executable. +> [!NOTE] +> On Windows systems, you should be able to double-click the zip archive to extract the `xcrawl3r` executable. ...move the `xcrawl3r` binary to somewhere in your `PATH`. For example, on GNU/Linux and OS X systems: @@ -64,7 +68,8 @@ tar xf xcrawl3r--linux-amd64.tar.gz sudo mv xcrawl3r /usr/local/bin/ ``` -**NOTE:** Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. 
+> [!NOTE] +> Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. ### Install source (With Go Installed) @@ -96,11 +101,12 @@ go install -v github.com/hueristiq/xcrawl3r/cmd/xcrawl3r@latest ```bash sudo mv xcrawl3r /usr/local/bin/ ``` + > [!NOTE] + > Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. - **NOTE:** Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. - -**NOTE:** While the development version is a good way to take a peek at `xcrawl3r`'s latest features before they get released, be aware that it may have bugs. Officially released versions will generally be more stable. +> [!CAUTION] +> While the development version is a good way to take a peek at `xcrawl3r`'s latest features before they get released, be aware that it may have bugs. Officially released versions will generally be more stable. ## Usage @@ -162,15 +168,22 @@ OUTPUT: ## Contributing -[Issues](https://github.com/hueristiq/xcrawl3r/issues) and [Pull Requests](https://github.com/hueristiq/xcrawl3r/pulls) are welcome! **Check out the [contribution guidelines](./CONTRIBUTING.md).** +[Issues](https://github.com/hueristiq/xcrawl3r/issues) and [Pull Requests](https://github.com/hueristiq/xcrawl3r/pulls) are welcome! **Check out the [contribution guidelines](https://github.com/hueristiq/xcrawl3r/blob/master/CONTRIBUTING.md).** ## Licensing -This utility is distributed under the [MIT license](./LICENSE). - +This utility is distributed under the [MIT license](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE). 
## Credits -* Alternatives - Check out projects below, that may fit in your workflow: +### Contributors + +Thanks to the amazing [contributors](https://github.com/hueristiq/xcrawl3r/graphs/contributors) for keeping this project alive. + +[![contributors](https://contrib.rocks/image?repo=hueristiq/xcrawl3r&max=500)](https://github.com/hueristiq/xcrawl3r/graphs/contributors) + +### Similar Projects + +Thanks to similar open source projects - check them out, may fit in your workflow. - [katana](https://github.com/projectdiscovery/katana) ◇ [gospider](https://github.com/jaeles-project/gospider) ◇ [hakrawler](https://github.com/hakluke/hakrawler) ◇ [urlgrab](https://github.com/IAmStoxe/urlgrab) \ No newline at end of file +[katana](https://github.com/projectdiscovery/katana) ◇ [gospider](https://github.com/jaeles-project/gospider) ◇ [hakrawler](https://github.com/hakluke/hakrawler) ◇ [urlgrab](https://github.com/IAmStoxe/urlgrab) \ No newline at end of file diff --git a/cmd/xcrawl3r/main.go b/cmd/xcrawl3r/main.go index 9be1003..1e8e738 100644 --- a/cmd/xcrawl3r/main.go +++ b/cmd/xcrawl3r/main.go @@ -7,10 +7,10 @@ import ( "os" "path/filepath" - hqlog "github.com/hueristiq/hqgoutils/log" - "github.com/hueristiq/hqgoutils/log/formatter" - "github.com/hueristiq/hqgoutils/log/levels" - hqurl "github.com/hueristiq/hqgoutils/url" + "github.com/hueristiq/hqgolog" + "github.com/hueristiq/hqgolog/formatter" + "github.com/hueristiq/hqgolog/levels" + "github.com/hueristiq/hqgourl" "github.com/hueristiq/xcrawl3r/internal/configuration" "github.com/hueristiq/xcrawl3r/pkg/xcrawl3r" "github.com/logrusorgru/aurora/v3" @@ -57,7 +57,7 @@ func init() { pflag.StringSliceVar(&proxies, "proxy", []string{}, "") pflag.BoolVar(&render, "render", false, "") pflag.IntVar(&timeout, "timeout", 10, "") - pflag.StringVar(&userAgent, "user-agent", "web", "") + pflag.StringVar(&userAgent, "user-agent", xcrawl3r.DefaultUserAgent, "") pflag.IntVarP(&concurrency, "concurrency", "c", 10, "") 
pflag.IntVar(&delay, "delay", 0, "") @@ -73,42 +73,42 @@ func init() { pflag.Usage = func() { fmt.Fprintln(os.Stderr, configuration.BANNER) - h := "USAGE:\n" + h := "\nUSAGE:\n" h += " xcrawl3r [OPTIONS]\n" h += "\nINPUT:\n" - h += " -d, --domain string domain to match URLs\n" - h += " --include-subdomains bool match subdomains' URLs\n" - h += " -s, --seeds string seed URLs file (use `-` to get from stdin)\n" - h += " -u, --url string URL to crawl\n" + h += " -d, --domain string domain to match URLs\n" + h += " --include-subdomains bool match subdomains' URLs\n" + h += " -s, --seeds string seed URLs file (use `-` to get from stdin)\n" + h += " -u, --url string URL to crawl\n" h += "\nCONFIGURATION:\n" - h += " --depth int maximum depth to crawl (default 3)\n" - h += " TIP: set it to `0` for infinite recursion\n" - h += " --headless bool If true the browser will be displayed while crawling.\n" - h += " -H, --headers string[] custom header to include in requests\n" - h += " e.g. -H 'Referer: http://example.com/'\n" - h += " TIP: use multiple flag to set multiple headers\n" - h += " --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)\n" - h += " TIP: use multiple flag to set multiple proxies\n" - h += " --render bool utilize a headless chrome instance to render pages\n" - h += " --timeout int time to wait for request in seconds (default: 10)\n" - h += " --user-agent string User Agent to use (default: web)\n" - h += " TIP: use `web` for a random web user-agent,\n" - h += " `mobile` for a random mobile user-agent,\n" - h += " or you can set your specific user-agent.\n" + h += " --depth int maximum depth to crawl (default 3)\n" + h += " TIP: set it to `0` for infinite recursion\n" + h += " --headless bool If true the browser will be displayed while crawling.\n" + h += " -H, --headers string[] custom header to include in requests\n" + h += " e.g. 
-H 'Referer: http://example.com/'\n" + h += " TIP: use multiple flag to set multiple headers\n" + h += " --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)\n" + h += " TIP: use multiple flag to set multiple proxies\n" + h += " --render bool utilize a headless chrome instance to render pages\n" + h += " --timeout int time to wait for request in seconds (default: 10)\n" + h += fmt.Sprintf(" --user-agent string User Agent to use (default: %s)\n", xcrawl3r.DefaultUserAgent) + h += " TIP: use `web` for a random web user-agent,\n" + h += " `mobile` for a random mobile user-agent,\n" + h += " or you can set your specific user-agent.\n" h += "\nRATE LIMIT:\n" - h += " -c, --concurrency int number of concurrent fetchers to use (default 10)\n" - h += " --delay int delay between each request in seconds\n" - h += " --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s)\n" - h += " -p, --parallelism int number of concurrent URLs to process (default: 10)\n" + h += " -c, --concurrency int number of concurrent fetchers to use (default 10)\n" + h += " --delay int delay between each request in seconds\n" + h += " --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s)\n" + h += " -p, --parallelism int number of concurrent URLs to process (default: 10)\n" h += "\nOUTPUT:\n" - h += " --debug bool enable debug mode (default: false)\n" - h += " -m, --monochrome bool coloring: no colored output mode\n" - h += " -o, --output string output file to write found URLs\n" - h += " -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)\n" + h += " --debug bool enable debug mode (default: false)\n" + h += " -m, --monochrome bool coloring: no colored output mode\n" + h += " -o, --output string output file to write found URLs\n" + h += " -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)\n" fmt.Fprint(os.Stderr, h) } @@ -116,8 +116,8 @@ func init() { pflag.Parse() // 
Initialize logger - hqlog.DefaultLogger.SetMaxLevel(levels.LevelStr(verbosity)) - hqlog.DefaultLogger.SetFormatter(formatter.NewCLI(&formatter.CLIOptions{ + hqgolog.DefaultLogger.SetMaxLevel(levels.LevelStr(verbosity)) + hqgolog.DefaultLogger.SetFormatter(formatter.NewCLI(&formatter.CLIOptions{ Colorize: !monochrome, })) @@ -130,7 +130,7 @@ func main() { } if seedsFile != "" && URL == "" && domain == "" { - hqlog.Fatal().Msg("using `-s, --seeds` requires either `-d, --domain` or `-u, --url` to be set!") + hqgolog.Fatal().Msg("using `-s, --seeds` requires either `-d, --domain` or `-u, --url` to be set!") } // Load input URLs @@ -155,21 +155,21 @@ func main() { case seedsFile != "" && seedsFile == "-": stat, err = os.Stdin.Stat() if err != nil { - hqlog.Fatal().Msg("no stdin") + hqgolog.Fatal().Msg("no stdin") } if stat.Mode()&os.ModeNamedPipe == 0 { - hqlog.Fatal().Msg("no stdin") + hqgolog.Fatal().Msg("no stdin") } file = os.Stdin case seedsFile != "" && seedsFile != "-": file, err = os.Open(seedsFile) if err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } default: - hqlog.Fatal().Msg("xcrawl3r takes input from stdin or file using a flag") + hqgolog.Fatal().Msg("xcrawl3r takes input from stdin or file using a flag") } scanner := bufio.NewScanner(file) @@ -183,13 +183,13 @@ func main() { } if scanner.Err() != nil { - hqlog.Fatal().Msgf("%s", err) + hqgolog.Fatal().Msgf("%s", err) } } - parsedURL, err := hqurl.Parse(domain) + parsedURL, err := hqgourl.Parse(domain) if err != nil { - hqlog.Fatal().Msgf("%s", err) + hqgolog.Fatal().Msgf("%s", err) } options := &xcrawl3r.Options{ @@ -215,7 +215,7 @@ func main() { crawler, err := xcrawl3r.New(options) if err != nil { - hqlog.Fatal().Msgf("%s", err) + hqgolog.Fatal().Msgf("%s", err) } URLs := crawler.Crawl() @@ -225,13 +225,13 @@ func main() { if _, err := os.Stat(directory); os.IsNotExist(err) { if err = os.MkdirAll(directory, os.ModePerm); err != nil { - hqlog.Fatal().Msg(err.Error()) + 
hqgolog.Fatal().Msg(err.Error()) } } - file, err := os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + file, err := os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } defer file.Close() @@ -240,23 +240,23 @@ func main() { for outputURL := range URLs { if verbosity == string(levels.LevelSilent) { - hqlog.Print().Msg(outputURL.Value) + hqgolog.Print().Msg(outputURL.Value) } else { - hqlog.Print().Msgf("[%s] %s", au.BrightBlue(outputURL.Source), outputURL.Value) + hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(outputURL.Source), outputURL.Value) } fmt.Fprintln(writer, outputURL.Value) } if err = writer.Flush(); err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } } else { for outputURL := range URLs { if verbosity == string(levels.LevelSilent) { - hqlog.Print().Msg(outputURL.Value) + hqgolog.Print().Msg(outputURL.Value) } else { - hqlog.Print().Msgf("[%s] %s", au.BrightBlue(outputURL.Source), outputURL.Value) + hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(outputURL.Source), outputURL.Value) } } } diff --git a/go.mod b/go.mod index eac06c4..af13f08 100644 --- a/go.mod +++ b/go.mod @@ -1,39 +1,43 @@ module github.com/hueristiq/xcrawl3r -go 1.20 +go 1.21.0 require ( github.com/chromedp/chromedp v0.9.3 github.com/gocolly/colly/v2 v2.1.0 - github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8 + github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead + github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f + github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53 github.com/logrusorgru/aurora/v3 v3.0.0 github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 github.com/spf13/pflag v1.0.5 ) require ( - github.com/PuerkitoBio/goquery v1.5.1 // indirect - github.com/andybalholm/cascadia v1.2.0 // indirect - github.com/antchfx/htmlquery v1.2.3 // indirect - 
github.com/antchfx/xmlquery v1.2.4 // indirect - github.com/antchfx/xpath v1.1.8 // indirect - github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 // indirect + github.com/Mzack9999/go-http-digest-auth-client v0.6.0 // indirect + github.com/PuerkitoBio/goquery v1.8.1 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/antchfx/htmlquery v1.3.0 // indirect + github.com/antchfx/xmlquery v1.3.18 // indirect + github.com/antchfx/xpath v1.2.5 // indirect + github.com/chromedp/cdproto v0.0.0-20231205062650-00455a960d61 // indirect github.com/chromedp/sysutil v1.0.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect - github.com/gobwas/ws v1.3.0 // indirect - github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect - github.com/golang/protobuf v1.4.2 // indirect + github.com/gobwas/ws v1.3.1 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect github.com/mailru/easyjson v0.7.7 // indirect - github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect - github.com/temoto/robotstxt v1.1.1 // indirect - golang.org/x/net v0.10.0 // indirect - golang.org/x/sys v0.8.0 // indirect - golang.org/x/term v0.8.0 // indirect - golang.org/x/text v0.9.0 // indirect - google.golang.org/appengine v1.6.6 // indirect - google.golang.org/protobuf v1.24.0 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.19.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/term v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect + google.golang.org/appengine 
v1.6.8 // indirect + google.golang.org/protobuf v1.31.0 // indirect ) diff --git a/go.sum b/go.sum index 27cc525..d011f13 100644 --- a/go.sum +++ b/go.sum @@ -1,20 +1,33 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= +github.com/Mzack9999/go-http-digest-auth-client v0.6.0 h1:LXVNMsj7qiNVmlZByFbjJmXf6SOm/uoo04XmnNcWPms= +github.com/Mzack9999/go-http-digest-auth-client v0.6.0/go.mod h1:gbwaYYXwA15ZfIxMyY5QU1acATDyNKEuG5TylBCL7AM= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= -github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= -github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= -github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= +github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= +github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= github.com/antchfx/xmlquery v1.2.4/go.mod 
h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= +github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0= +github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA= github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/antchfx/xpath v1.2.5 h1:hqZ+wtQ+KIOV/S3bGZcIhpgYC26um2bZYP2KVGcR7VY= +github.com/antchfx/xpath v1.2.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 h1:2zipcnjfFdqAjOQa8otCCh0Lk1M7RBzciy3s80YAKHk= github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/cdproto v0.0.0-20231126232103-8e31ff06e23b h1:SywfM3985mh0PaXhiZbgR+VQFSfFXJSNLX2p+3vBvOc= +github.com/chromedp/cdproto v0.0.0-20231126232103-8e31ff06e23b/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/cdproto v0.0.0-20231205062650-00455a960d61 h1:XD280QPATe9jaz20dylKe3vBsNcH1w3mkssGY0lidn8= +github.com/chromedp/cdproto v0.0.0-20231205062650-00455a960d61/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= github.com/chromedp/chromedp v0.9.3 h1:Wq58e0dZOdHsxaj9Owmfcf+ibtpYN1N0FWVbaxa/esg= github.com/chromedp/chromedp v0.9.3/go.mod h1:NipeUkUcuzIdFbBP8eNNvl9upcceOfWzoJn6cRe4ksA= github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= @@ -31,14 +44,17 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU 
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.3.0 h1:sbeU3Y4Qzlb+MOzIe6mQGf7QR4Hkv6ZD0qhGkBFL2O0= github.com/gobwas/ws v1.3.0/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= +github.com/gobwas/ws v1.3.1 h1:Qi34dfLMWJbiKaNbDVzM9x27nZBjmkaW6i4+Ku+pGVU= +github.com/gobwas/ws v1.3.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -49,15 +65,25 @@ github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrU github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= 
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= -github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8 h1:Y5Hsbpr9c5oK2l/ktfWdPOLBOx9MPlH7vRQNK1mJmiQ= -github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8/go.mod h1:GxYwOCC1RrHbilApc+wccYiaABLlRnnYaHcobxNDHos= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead h1:Iep2G2h3hSwc7w0qr1iVVAptgXqjn7DRXVQ33luPmhk= +github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead/go.mod h1:Faf/mOhyfNnLIfhoYj2vfPrjt0nKBr4WaU+OQ0C7r6U= +github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f h1:JAgZOIJ+UbkENpRiOTlfg51CW0UNrUkgwLjUGiH+x9g= +github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f/go.mod 
h1:S5J3E3Azva5+JKv67uc+Hh3XwLDvkVYDGjEaMTFrIqg= +github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53 h1:6pwdpEJoB1woSToh0cxLh5QirNOAp2z7DzvMKiaqdro= +github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53/go.mod h1:Fc2vfWpIVFWUmCv1S0xVsz3mIPYwdgsa6f2vCgL4CrA= +github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 h1:dpHAa9c74HgAXkZ2WPd84q2cCiF76eluuSGRw7bk7To= +github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440/go.mod h1:NlZ117o///yWDbRAbgYD7/Y44qce8z1Dj4caUsjunSY= github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= @@ -76,22 +102,29 @@ github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4/go.mod github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= 
-github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net 
v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -99,40 +132,75 @@ golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= +golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod 
h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.8.0 h1:n5xxQn2i3PC0yLAbjTpNT85q/Kgzcr2gIoX9OrJUols= 
-golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod 
h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= @@ -147,7 +215,12 @@ google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzi google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go index 9ec9597..f17ddc9 100644 --- a/internal/configuration/configuration.go +++ b/internal/configuration/configuration.go @@ -3,23 +3,20 @@ package configuration import "github.com/logrusorgru/aurora/v3" const ( - NAME string = "xcrawl3r" - VERSION string = "0.1.0" - DESCRIPTION string = "A CLI utility to recursively crawl webpages." 
+ NAME string = "xcrawl3r" + VERSION string = "0.1.0" ) -var ( - BANNER = aurora.Sprintf( - aurora.BrightBlue(` +var BANNER = aurora.Sprintf( + aurora.BrightBlue(` _ _____ __ _____ _ __ __ ___ _| |___ / _ __ \ \/ / __| '__/ _`+"`"+` \ \ /\ / / | |_ \| '__| > < (__| | | (_| |\ V V /| |___) | | -/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| %s +/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| + %s -%s -`).Bold(), - aurora.BrightYellow("v"+VERSION).Bold(), - aurora.BrightGreen(DESCRIPTION).Italic(), - ) + %s`).Bold(), + aurora.BrightRed("v"+VERSION).Bold(), + aurora.BrightYellow("with <3 by Hueristiq Open Source").Italic(), ) diff --git a/pkg/browser/browser.go b/pkg/browser/browser.go index 1037d10..395e590 100644 --- a/pkg/browser/browser.go +++ b/pkg/browser/browser.go @@ -5,29 +5,30 @@ import ( "log" "github.com/chromedp/chromedp" - hqlog "github.com/hueristiq/hqgoutils/log" + "github.com/hueristiq/hqgolog" ) -var GlobalContext context.Context -var GlobalCancel context.CancelFunc +var ( + GlobalContext context.Context + GlobalCancel context.CancelFunc +) func GetRenderedSource(url string) (outerHTML string) { // same browser, second tab newCtx, newCtxCancel := chromedp.NewContext(GlobalContext) + defer newCtxCancel() // ensure the second tab is created if err := chromedp.Run(newCtx); err != nil { newCtxCancel() - hqlog.Fatal().Msg(err.Error()) + + hqgolog.Fatal().Msg(err.Error()) } // navigate to a page, and get it's entire HTML - if err := chromedp.Run(newCtx, - chromedp.Navigate(url), - chromedp.OuterHTML("html", &outerHTML), - ); err != nil { - hqlog.Error().Msg(err.Error()) + if err := chromedp.Run(newCtx, chromedp.Navigate(url), chromedp.OuterHTML("html", &outerHTML)); err != nil { + hqgolog.Error().Msg(err.Error()) } return @@ -52,7 +53,7 @@ func GetGlobalContext(headless bool, proxy string) (ctx context.Context, cancel // ensure the first tab is created if err := chromedp.Run(ctx); err != nil { - hqlog.Fatal().Msg(err.Error()) + hqgolog.Fatal().Msg(err.Error()) } return 
diff --git a/pkg/xcrawl3r/page_strategy.go b/pkg/xcrawl3r/page_strategy.go index fa25a9d..a9bcb61 100644 --- a/pkg/xcrawl3r/page_strategy.go +++ b/pkg/xcrawl3r/page_strategy.go @@ -6,11 +6,11 @@ import ( "strings" "github.com/gocolly/colly/v2" - hqurl "github.com/hueristiq/hqgoutils/url" + "github.com/hueristiq/hqgourl" "github.com/hueristiq/xcrawl3r/pkg/browser" ) -func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { +func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) { URLsChannel = make(chan URL) go func() { @@ -48,11 +48,17 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { } }) + crawler.FileCollector.OnError(func(_ *colly.Response, err error) { + }) + + crawler.FileCollector.OnResponse(func(response *colly.Response) { + }) + crawler.PageCollector.OnHTML("[href]", func(e *colly.HTMLElement) { relativeURL := e.Attr("href") absoluteURL := e.Request.AbsoluteURL(relativeURL) - parsedAbsoluteURL, err := hqurl.Parse(absoluteURL) + parsedAbsoluteURL, err := hqgourl.Parse(absoluteURL) if err != nil { return } @@ -112,9 +118,7 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqurl.URL) (URLsChannel chan URL) { body := decode(string(response.Body)) URLs := crawler.URLsRegex.FindAllString(body, -1) - for index := range URLs { - fileURL := URLs[index] - + for _, fileURL := range URLs { // remove beginning and ending quotes fileURL = strings.Trim(fileURL, "\"") fileURL = strings.Trim(fileURL, "'") diff --git a/pkg/xcrawl3r/robots_strategy.go b/pkg/xcrawl3r/robots_strategy.go index bc8d8ef..3fb4b4d 100644 --- a/pkg/xcrawl3r/robots_strategy.go +++ b/pkg/xcrawl3r/robots_strategy.go @@ -3,14 +3,15 @@ package xcrawl3r import ( "fmt" "io" - "net/http" "regexp" "strings" - hqurl "github.com/hueristiq/hqgoutils/url" + "github.com/hueristiq/hqgohttp" + "github.com/hueristiq/hqgohttp/status" + "github.com/hueristiq/hqgourl" ) -func (crawler *Crawler) robotsParsing(parsedURL *hqurl.URL) 
(URLsChannel chan URL) { +func (crawler *Crawler) robotsParsing(parsedURL *hqgourl.URL) (URLsChannel chan URL) { URLsChannel = make(chan URL) go func() { @@ -18,14 +19,14 @@ func (crawler *Crawler) robotsParsing(parsedURL *hqurl.URL) (URLsChannel chan UR robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host) - res, err := http.Get(robotsURL) //nolint:gosec // Works! + res, err := hqgohttp.Get(robotsURL) if err != nil { return } defer res.Body.Close() - if res.StatusCode == 200 { + if res.StatusCode == status.OK { URLsChannel <- URL{Source: "known", Value: robotsURL} body, err := io.ReadAll(res.Body) diff --git a/pkg/xcrawl3r/sitemap_strategy.go b/pkg/xcrawl3r/sitemap_strategy.go index f1f6333..a92f367 100644 --- a/pkg/xcrawl3r/sitemap_strategy.go +++ b/pkg/xcrawl3r/sitemap_strategy.go @@ -3,11 +3,11 @@ package xcrawl3r import ( "fmt" - hqurl "github.com/hueristiq/hqgoutils/url" + "github.com/hueristiq/hqgourl" sitemap "github.com/oxffaa/gopher-parse-sitemap" ) -func (crawler *Crawler) sitemapParsing(parsedURL *hqurl.URL) (URLsChannel chan URL) { +func (crawler *Crawler) sitemapParsing(parsedURL *hqgourl.URL) (URLsChannel chan URL) { URLsChannel = make(chan URL) go func() { diff --git a/pkg/xcrawl3r/utils.go b/pkg/xcrawl3r/utils.go index 5d013fe..9c7890d 100644 --- a/pkg/xcrawl3r/utils.go +++ b/pkg/xcrawl3r/utils.go @@ -3,7 +3,7 @@ package xcrawl3r import ( "strings" - hqurl "github.com/hueristiq/hqgoutils/url" + "github.com/hueristiq/hqgourl" ) func decode(source string) (decodedSource string) { @@ -17,10 +17,10 @@ func decode(source string) (decodedSource string) { return } -func (crawler *Crawler) fixURL(parsedURL *hqurl.URL, URL string) (fixedURL string) { +func (crawler *Crawler) fixURL(parsedURL *hqgourl.URL, URL string) (fixedURL string) { // decode // this .... - if strings.HasPrefix(URL, "http") { //nolint:gocritic // Works! 
+ if strings.HasPrefix(URL, "http") { // `http://google.com` OR `https://google.com` fixedURL = URL } else if strings.HasPrefix(URL, "//") { @@ -50,7 +50,7 @@ func (crawler *Crawler) fixURL(parsedURL *hqurl.URL, URL string) (fixedURL strin } func (crawler *Crawler) IsInScope(URL string) (isInScope bool) { - parsedURL, err := hqurl.Parse(URL) + parsedURL, err := hqgourl.Parse(URL) if err != nil { return } diff --git a/pkg/xcrawl3r/xcrawl3r.go b/pkg/xcrawl3r/xcrawl3r.go index f4f4d3f..275ad11 100644 --- a/pkg/xcrawl3r/xcrawl3r.go +++ b/pkg/xcrawl3r/xcrawl3r.go @@ -14,10 +14,11 @@ import ( "github.com/gocolly/colly/v2/debug" "github.com/gocolly/colly/v2/extensions" "github.com/gocolly/colly/v2/proxy" - hqurl "github.com/hueristiq/hqgoutils/url" + "github.com/hueristiq/hqgourl" + "github.com/hueristiq/xcrawl3r/internal/configuration" ) -type Options struct { //nolint:govet // To be refactored +type Options struct { Domain string IncludeSubdomains bool Seeds []string @@ -38,7 +39,7 @@ type Options struct { //nolint:govet // To be refactored Debug bool } -type Crawler struct { //nolint:govet // To be refactored +type Crawler struct { Domain string IncludeSubdomains bool Seeds []string @@ -65,6 +66,8 @@ type Crawler struct { //nolint:govet // To be refactored URLsRegex *regexp.Regexp } +var DefaultUserAgent = fmt.Sprintf("%s v%s (https://github.com/hueristiq/%s)", configuration.NAME, configuration.VERSION, configuration.NAME) + func New(options *Options) (crawler *Crawler, err error) { crawler = &Crawler{ Domain: options.Domain, @@ -89,7 +92,7 @@ func New(options *Options) (crawler *Crawler, err error) { crawler.URLsRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| 
*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`) //nolint:gocritic // Works fine! - crawler.FileURLsRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`) //nolint:gocritic // Works fine! + crawler.FileURLsRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`) crawler.URLsNotToRequestRegex = regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`) @@ -132,7 +135,7 @@ func New(options *Options) (crawler *Crawler, err error) { var splitEntry []string - if strings.Contains(entry, ": ") { //nolint:gocritic // Works! + if strings.Contains(entry, ": ") { splitEntry = strings.SplitN(entry, ": ", 2) } else if strings.Contains(entry, ":") { splitEntry = strings.SplitN(entry, ":", 2) @@ -150,13 +153,17 @@ func New(options *Options) (crawler *Crawler, err error) { extensions.Referer(crawler.PageCollector) - switch ua := strings.ToLower(crawler.UserAgent); { - case strings.HasPrefix(ua, "mob"): - extensions.RandomMobileUserAgent(crawler.PageCollector) - case strings.HasPrefix(ua, "web"): - extensions.RandomUserAgent(crawler.PageCollector) - default: - crawler.PageCollector.UserAgent = crawler.UserAgent + if crawler.UserAgent == "" { + crawler.PageCollector.UserAgent = DefaultUserAgent + } else { + switch ua := strings.ToLower(crawler.UserAgent); { + case strings.HasPrefix(ua, "mob"): + extensions.RandomMobileUserAgent(crawler.PageCollector) + case strings.HasPrefix(ua, "web"): + extensions.RandomUserAgent(crawler.PageCollector) + default: + crawler.PageCollector.UserAgent = crawler.UserAgent + } } HTTPTransport := &http.Transport{ @@ -169,7 
+176,7 @@ func New(options *Options) (crawler *Crawler, err error) { IdleConnTimeout: time.Duration(crawler.Timeout) * time.Second, TLSHandshakeTimeout: time.Duration(crawler.Timeout) * time.Second, TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, //nolint:gosec // Intended + InsecureSkipVerify: true, Renegotiation: tls.RenegotiateOnceAsClient, }, } @@ -179,16 +186,14 @@ func New(options *Options) (crawler *Crawler, err error) { CheckRedirect: func(req *http.Request, via []*http.Request) (err error) { nextLocation := req.Response.Header.Get("Location") - var parsedLocation *hqurl.URL + var parsedLocation *hqgourl.URL - parsedLocation, err = hqurl.Parse(nextLocation) + parsedLocation, err = hqgourl.Parse(nextLocation) if err != nil { return err } - if crawler.IncludeSubdomains && - (parsedLocation.Domain == crawler.Domain || - strings.HasSuffix(parsedLocation.Domain, "."+crawler.Domain)) { + if crawler.IncludeSubdomains && (parsedLocation.Domain == crawler.Domain || strings.HasSuffix(parsedLocation.Domain, "."+crawler.Domain)) { return nil } @@ -254,7 +259,7 @@ func (crawler *Crawler) Crawl() (URLsChannel chan URL) { defer URLsWG.Done() for seed := range seedsChannel { - parsedSeed, err := hqurl.Parse(seed) + parsedSeed, err := hqgourl.Parse(seed) if err != nil { continue } From f630443fdca55e2ea5296b1a7b9e60c1d84e04a8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 22 Dec 2023 05:53:48 +0000 Subject: [PATCH 06/13] chore(deps): bump actions/setup-go from 4 to 5 Bumps [actions/setup-go](https://github.com/actions/setup-go) from 4 to 5. - [Release notes](https://github.com/actions/setup-go/releases) - [Commits](https://github.com/actions/setup-go/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/setup-go dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/build-test.yml | 2 +- .github/workflows/lint-test.yml | 2 +- .github/workflows/release.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 5216c78..e2a67cb 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: '>=1.21' - diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint-test.yml index 88f9cdc..a6c52d5 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint-test.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: '>=1.21' cache: false diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2334759..eff1f39 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: '>=1.21' - From 8a986eea1e393b871da6e8dbc582fb2780d4c3c4 Mon Sep 17 00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Mon, 14 Oct 2024 21:23:08 +0300 Subject: [PATCH 07/13] chore(*): - --- .dockerignore | 8 + .../workflows/{build-test.yml => build.yml} | 10 +- .../{codeql-analysis.yml => codeql.yml} | 26 +- .github/workflows/dockerhub-push.yaml | 44 +++ .github/workflows/{lint-test.yml => lint.yml} | 10 +- .github/workflows/release.yml | 4 +- .gitignore | 2 - .golangci.yaml | 373 ++++++++++++------ .vscode/extenstions.json | 5 + .vscode/settings.json | 6 + Dockerfile | 67 ++++ Makefile | 130 +++++- README.md | 35 +- cmd/xcrawl3r/main.go | 87 ++-- go.mod | 34 +- go.sum | 87 ++-- internal/configuration/configuration.go | 5 +- pkg/parser/sitemap/sitemap.go | 154 ++++++++ pkg/xcrawl3r/output.go 
| 6 - pkg/xcrawl3r/page_strategy.go | 85 +++- pkg/xcrawl3r/result.go | 19 + pkg/xcrawl3r/robots_strategy.go | 96 +++-- pkg/xcrawl3r/sitemap_strategy.go | 84 +++- pkg/xcrawl3r/utils.go | 12 +- pkg/xcrawl3r/xcrawl3r.go | 321 +++++++-------- 25 files changed, 1215 insertions(+), 495 deletions(-) create mode 100644 .dockerignore rename .github/workflows/{build-test.yml => build.yml} (83%) rename .github/workflows/{codeql-analysis.yml => codeql.yml} (70%) create mode 100644 .github/workflows/dockerhub-push.yaml rename .github/workflows/{lint-test.yml => lint.yml} (81%) create mode 100644 .vscode/extenstions.json create mode 100644 .vscode/settings.json create mode 100644 Dockerfile create mode 100644 pkg/parser/sitemap/sitemap.go delete mode 100644 pkg/xcrawl3r/output.go create mode 100644 pkg/xcrawl3r/result.go diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ae1ecbc --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +.github +bin +.gitignore +.golangci.yaml +.goreleaser.yaml +CONTRIBUTING.md +LICENSE +README.md \ No newline at end of file diff --git a/.github/workflows/build-test.yml b/.github/workflows/build.yml similarity index 83% rename from .github/workflows/build-test.yml rename to .github/workflows/build.yml index e2a67cb..8447c01 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build.yml @@ -1,4 +1,4 @@ -name: 🔨 Build Test +name: 🔨 Build on: push: @@ -17,7 +17,7 @@ on: jobs: build: - name: Build Test + name: Build strategy: matrix: os: [ubuntu-latest, windows-latest, macOS-12] @@ -25,9 +25,9 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v4 with: - go-version: '>=1.21' + go-version: '>=1.23' - name: Checkout the repository uses: actions/checkout@v4 @@ -42,4 +42,4 @@ jobs: - name: Go build run: go build -v . 
- working-directory: ./cmd/xsubfind3r \ No newline at end of file + working-directory: ./cmd/xcrawl3r \ No newline at end of file diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql.yml similarity index 70% rename from .github/workflows/codeql-analysis.yml rename to .github/workflows/codeql.yml index a978f59..5c29437 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql.yml @@ -1,4 +1,4 @@ -name: 🚨 CodeQL Analysis +name: 🚨 Analyze Code (CodeQL) on: push: @@ -17,30 +17,30 @@ on: jobs: analyze: - name: CodeQL Analysis + name: Analyze Code (CodeQL) + strategy: + fail-fast: false + matrix: + language: [ 'go' ] runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write - - strategy: - fail-fast: false - matrix: - language: [ 'go' ] - steps: - - - name: Checkout repository + - + name: Checkout the repository uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 \ No newline at end of file + uses: github/codeql-action/analyze@v3 \ No newline at end of file diff --git a/.github/workflows/dockerhub-push.yaml b/.github/workflows/dockerhub-push.yaml new file mode 100644 index 0000000..b8da853 --- /dev/null +++ b/.github/workflows/dockerhub-push.yaml @@ -0,0 +1,44 @@ +name: 🐋 DockerHub Push + +on: + workflow_run: + workflows: ["🎉 Release"] + types: + - completed + workflow_dispatch: + +jobs: + push: + name: DockerHub Push + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + attestations: write + id-token: write + steps: + - + name: Checkout + uses: actions/checkout@v4 + + - + name: Get Github tag + id: meta + run: | + curl --silent 
"https://api.github.com/repos/hueristiq/xcrawl3r/releases/latest" | jq -r .tag_name | xargs -I {} echo TAG={} >> $GITHUB_OUTPUT + + - + name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - + name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . + file: ./Dockerfile + push: true + tags: hueristiq/xcrawl3r:latest,hueristiq/xcrawl3r:${{ steps.meta.outputs.TAG }} diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint.yml similarity index 81% rename from .github/workflows/lint-test.yml rename to .github/workflows/lint.yml index a6c52d5..7904b01 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint.yml @@ -1,4 +1,4 @@ -name: 💅 Lint Test +name: 💅🏻 Lint on: push: @@ -20,14 +20,14 @@ permissions: jobs: lint: - name: Lint Test + name: Lint runs-on: ubuntu-latest steps: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '>=1.21' + go-version: '>=1.23' cache: false - name: Checkout the repository @@ -36,8 +36,8 @@ jobs: fetch-depth: 0 - name: Run golangci-lint - uses: golangci/golangci-lint-action@v3 + uses: golangci/golangci-lint-action@v6 with: - version: v1.54.2 + version: v1.61.0 args: --timeout 5m working-directory: . 
\ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index eff1f39..5e642f1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,9 +14,9 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v4 with: - go-version: '>=1.21' + go-version: '>=1.23' - name: Checkout the repository uses: actions/checkout@v4 diff --git a/.gitignore b/.gitignore index 77a98d4..c5e82d7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1 @@ -# Executable - bin \ No newline at end of file diff --git a/.golangci.yaml b/.golangci.yaml index 2f183b5..0e6d4e6 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -1,122 +1,232 @@ +# Options for analysis running. run: + # Number of operating system threads (`GOMAXPROCS`) that can execute golangci-lint simultaneously. + # If it is explicitly set to 0 (i.e. not the default) then golangci-lint will automatically set the value to match Linux container CPU quota. + # Default: the number of logical CPUs in the machine + # concurrency: 4 # Timeout for analysis, e.g. 30s, 5m. # Default: 1m timeout: 5m + # Exit code when at least one issue was found. + # Default: 1 + issues-exit-code: 1 + # Include test files or not. + # Default: true + tests: true + # List of build tags, all linters use it. + # Default: [] + build-tags: [] + # If set, we pass it to "go list -mod={option}". From "go help modules": + # If invoked with -mod=readonly, the go command is disallowed from the implicit + # automatic updating of go.mod described above. Instead, it fails when any changes + # to go.mod are needed. This setting is most useful to check that go.mod does + # not need updates, such as in a continuous integration and testing system. + # If invoked with -mod=vendor, the go command assumes that the vendor + # directory holds the correct copies of dependencies and ignores + # the dependency descriptions in go.mod. 
+ # + # Allowed values: readonly|vendor|mod + # Default: "" + modules-download-mode: readonly + # Allow multiple parallel golangci-lint instances running. + # If false, golangci-lint acquires file lock on start. + # Default: false + allow-parallel-runners: true + # Allow multiple golangci-lint instances running, but serialize them around a lock. + # If false, golangci-lint exits with an error if it fails to acquire file lock on start. + # Default: false + allow-serial-runners: true + # Define the Go version limit. + # Mainly related to generics support since go1.18. + # Default: use Go version from the go.mod file, fallback on the env var `GOVERSION`, fallback on 1.17 + go: '1.23' + +# output configuration options +output: + # The formats used to render issues. + # Formats: + # - `colored-line-number` + # - `line-number` + # - `json` + # - `colored-tab` + # - `tab` + # - `html` + # - `checkstyle` + # - `code-climate` + # - `junit-xml` + # - `junit-xml-extended` + # - `github-actions` + # - `teamcity` + # - `sarif` + # Output path can be either `stdout`, `stderr` or path to the file to write to. + # + # For the CLI flag (`--out-format`), multiple formats can be specified by separating them by comma. + # The output can be specified for each of them by separating format name and path by colon symbol. + # Example: "--out-format=checkstyle:report.xml,json:stdout,colored-line-number" + # The CLI flag (`--out-format`) override the configuration file. + # + # Default: + # formats: + # - format: colored-line-number + # path: stdout + formats: + # - + # format: json + # path: stderr + # - + # format: checkstyle + # path: report.xml + - + format: colored-line-number + path: stderr + # Print lines of code with issue. + # Default: true + print-issued-lines: true + # Print linter name in the end of issue text. + # Default: true + print-linter-name: true + # Make issues output unique by line. + # Default: true + uniq-by-line: false + # Add a prefix to the output file references. 
+ # Default: "" + path-prefix: "" + # Sort results by the order defined in `sort-order`. + # Default: false + sort-results: true + # Order to use when sorting results. + # Require `sort-results` to `true`. + # Possible values: `file`, `linter`, and `severity`. + # + # If the severity values are inside the following list, they are ordered in this order: + # 1. error + # 2. warning + # 3. high + # 4. medium + # 5. low + # Either they are sorted alphabetically. + # + # Default: ["file"] + sort-order: + - linter + - severity + - file # filepath, line, and column. + # Show statistics per linter. + # Default: false + show-stats: false linters: # Disable all linters. # Default: false disable-all: true # Enable specific linter + # https://golangci-lint.run/usage/linters/#enabled-by-default enable: - # Enabled by Default - - errcheck # errcheck is a program for checking for unchecked errors in Go code. These unchecked errors can be critical bugs in some cases [fast: false, auto-fix: false] - - gosimple # (megacheck) # Linter for Go source code that specializes in simplifying code [fast: false, auto-fix: false] - - govet # (vet, vetshadow) # Vet examines Go source code and reports suspicious constructs, such as Printf calls whose arguments do not align with the format string [fast: false, auto-fix: false] - - ineffassign # Detects when assignments to existing variables are not used [fast: true, auto-fix: false] - - staticcheck # (megacheck) # It's a set of rules from staticcheck. It's not the same thing as the staticcheck binary. The author of staticcheck doesn't support or approve the use of staticcheck as a library inside golangci-lint. 
[fast: false, auto-fix: false] - - unused # (megacheck) # Checks Go code for unused constants, variables, functions and types [fast: false, auto-fix: false] - # Disabled by Default - - asasalint # check for pass []any as any in variadic func(...any) [fast: false, auto-fix: false] - - asciicheck # Simple linter to check that your code does not contain non-ASCII identifiers [fast: true, auto-fix: false] - - bidichk # Checks for dangerous unicode character sequences [fast: true, auto-fix: false] - - bodyclose # checks whether HTTP response body is closed successfully [fast: false, auto-fix: false] - - containedctx # containedctx is a linter that detects struct contained context.Context field [fast: false, auto-fix: false] - - contextcheck # check whether the function uses a non-inherited context [fast: false, auto-fix: false] - # - cyclop # checks function and package cyclomatic complexity [fast: false, auto-fix: false] - # - deadcode # [deprecated] # Finds unused code [fast: false, auto-fix: false] - - decorder # check declaration order and count of types, constants, variables and functions [fast: true, auto-fix: false] - # - depguard # Go linter that checks if package imports are in a list of acceptable packages [fast: true, auto-fix: false] - - dogsled # Checks assignments with too many blank identifiers (e.g. x, _, _, _, := f()) [fast: true, auto-fix: false] - # - dupl # Tool for code clone detection [fast: true, auto-fix: false] - - dupword # checks for duplicate words in the source code [fast: true, auto-fix: true] - - durationcheck # check for two durations multiplied together [fast: false, auto-fix: false] - - errchkjson # Checks types passed to the json encoding functions. Reports unsupported types and optionally reports occasions, where the check for the returned error can be omitted. [fast: false, auto-fix: false] - - errname # Checks that sentinel errors are prefixed with the `Err` and error types are suffixed with the `Error`. 
[fast: false, auto-fix: false] - - errorlint # errorlint is a linter for that can be used to find code that will cause problems with the error wrapping scheme introduced in Go 1.13. [fast: false, auto-fix: false] - - execinquery # execinquery is a linter about query string checker in Query function which reads your Go src files and warning it finds [fast: false, auto-fix: false] - - exhaustive # check exhaustiveness of enum switch statements [fast: false, auto-fix: false] - # - exhaustivestruct # [deprecated] # Checks if all struct's fields are initialized [fast: false, auto-fix: false] - # - exhaustruct # Checks if all structure fields are initialized [fast: false, auto-fix: false] - - exportloopref # checks for pointers to enclosing loop variables [fast: false, auto-fix: false] - # - forbidigo # Forbids identifiers [fast: false, auto-fix: false] - - forcetypeassert # finds forced type assertions [fast: true, auto-fix: false] - # - funlen # Tool for detection of long functions [fast: true, auto-fix: false] - - gci # Gci controls Go package import order and makes it always deterministic. [fast: true, auto-fix: false] - - ginkgolinter # enforces standards of using ginkgo and gomega [fast: false, auto-fix: false] - - gocheckcompilerdirectives # Checks that go compiler directive comments (//go:) are valid. [fast: true, auto-fix: false] - # - gochecknoglobals # check that no global variables exist [fast: false, auto-fix: false] - # - gochecknoinits # Checks that no init functions are present in Go code [fast: true, auto-fix: false] - # - gocognit # Computes and checks the cognitive complexity of functions [fast: true, auto-fix: false] - - goconst # Finds repeated strings that could be replaced by a constant [fast: true, auto-fix: false] - - gocritic # Provides diagnostics that check for bugs, performance and style issues. 
[fast: false, auto-fix: false] - # - gocyclo # Computes and checks the cyclomatic complexity of functions [fast: true, auto-fix: false] - # - godot # Check if comments end in a period [fast: true, auto-fix: true] - # - godox # Tool for detection of FIXME, TODO and other comment keywords [fast: true, auto-fix: false] - # - goerr113 # Go linter to check the errors handling expressions [fast: false, auto-fix: false] - - gofmt # Gofmt checks whether code was gofmt-ed. By default this tool runs with -s option to check for code simplification [fast: true, auto-fix: true] - - gofumpt # Gofumpt checks whether code was gofumpt-ed. [fast: true, auto-fix: true] - - goheader # Checks is file header matches to pattern [fast: true, auto-fix: false] - - goimports # Check import statements are formatted according to the 'goimport' command. Reformat imports in autofix mode. [fast: true, auto-fix: true] - # - golint # [deprecated] # Golint differs from gofmt. Gofmt reformats Go source code, whereas golint prints out style mistakes [fast: false, auto-fix: false] - # - gomnd # An analyzer to detect magic numbers. [fast: true, auto-fix: false] - - gomoddirectives # Manage the use of 'replace', 'retract', and 'excludes' directives in go.mod. [fast: true, auto-fix: false] - - gomodguard # Allow and block list linter for direct Go module dependencies. This is different from depguard where there are different block types for example version constraints and module recommendations. [fast: true, auto-fix: false] - - goprintffuncname # Checks that printf-like functions are named with `f` at the end [fast: true, auto-fix: false] - # - gosec # (gas) # Inspects source code for security problems [fast: false, auto-fix: false] - - gosmopolitan # Report certain i18n/l10n anti-patterns in your Go codebase [fast: false, auto-fix: false] - - grouper # An analyzer to analyze expression groups. 
[fast: true, auto-fix: false] - # - ifshort # [deprecated] # Checks that your code uses short syntax for if-statements whenever possible [fast: true, auto-fix: false] - - importas # Enforces consistent import aliases [fast: false, auto-fix: false] - - interfacebloat # A linter that checks the number of methods inside an interface. [fast: true, auto-fix: false] - # - interfacer # [deprecated] # Linter that suggests narrower interface types [fast: false, auto-fix: false] - # - ireturn # Accept Interfaces, Return Concrete Types [fast: false, auto-fix: false] - # - lll # Reports long lines [fast: true, auto-fix: false] - - loggercheck # (logrlint) # Checks key value pairs for common logger libraries (kitlog,klog,logr,zap). [fast: false, auto-fix: false] - - maintidx # maintidx measures the maintainability index of each function. [fast: true, auto-fix: false] - - makezero # Finds slice declarations with non-zero initial length [fast: false, auto-fix: false] - # - maligned # [deprecated] # Tool to detect Go structs that would take less memory if their fields were sorted [fast: false, auto-fix: false] - - mirror # reports wrong mirror patterns of bytes/strings usage [fast: false, auto-fix: false] - - misspell # Finds commonly misspelled English words in comments [fast: true, auto-fix: true] - - musttag # enforce field tags in (un)marshaled structs [fast: false, auto-fix: false] - # - nakedret # Finds naked returns in functions greater than a specified function length [fast: true, auto-fix: false] - - nestif # Reports deeply nested if statements [fast: true, auto-fix: false] - - nilerr # Finds the code that returns nil even if it checks that the error is not nil. [fast: false, auto-fix: false] - - nilnil # Checks that there is no simultaneous return of `nil` error and an invalid value. 
[fast: false, auto-fix: false] - - nlreturn # nlreturn checks for a new line before return and branch statements to increase code clarity [fast: true, auto-fix: false] - - noctx # noctx finds sending http request without context.Context [fast: false, auto-fix: false] - - nolintlint # Reports ill-formed or insufficient nolint directives [fast: true, auto-fix: false] - # - nonamedreturns # Reports all named returns [fast: false, auto-fix: false] - # - nosnakecase # [deprecated] # nosnakecase is a linter that detects snake case of variable naming and function name. [fast: true, auto-fix: false] - - nosprintfhostport # Checks for misuse of Sprintf to construct a host with port in a URL. [fast: true, auto-fix: false] - - paralleltest # paralleltest detects missing usage of t.Parallel() method in your Go test [fast: false, auto-fix: false] - - prealloc # Finds slice declarations that could potentially be pre-allocated [fast: true, auto-fix: false] - - predeclared # find code that shadows one of Go's predeclared identifiers [fast: true, auto-fix: false] - # - promlinter # Check Prometheus metrics naming via promlint [fast: true, auto-fix: false] - - reassign # Checks that package variables are not reassigned [fast: false, auto-fix: false] - - revive # Fast, configurable, extensible, flexible, and beautiful linter for Go. Drop-in replacement of golint. [fast: false, auto-fix: false] - - rowserrcheck # checks whether Err of rows is checked successfully [fast: false, auto-fix: false] - # - scopelint # [deprecated] # Scopelint checks for unpinned variables in go programs [fast: true, auto-fix: false] - - sqlclosecheck # Checks that sql.Rows and sql.Stmt are closed. 
[fast: false, auto-fix: false] - # - structcheck # [deprecated] # Finds unused struct fields [fast: false, auto-fix: false] - - stylecheck # Stylecheck is a replacement for golint [fast: false, auto-fix: false] - - tagalign # check that struct tags are well aligned [fast: true, auto-fix: true] - # - tagliatelle # Checks the struct tags. [fast: true, auto-fix: false] - - tenv # tenv is analyzer that detects using os.Setenv instead of t.Setenv since Go1.17 [fast: false, auto-fix: false] - - testableexamples # linter checks if examples are testable (have an expected output) [fast: true, auto-fix: false] - - testpackage # linter that makes you use a separate _test package [fast: true, auto-fix: false] - - thelper # thelper detects Go test helpers without t.Helper() call and checks the consistency of test helpers [fast: false, auto-fix: false] - - tparallel # tparallel detects inappropriate usage of t.Parallel() method in your Go test codes [fast: false, auto-fix: false] - - unconvert # Remove unnecessary type conversions [fast: false, auto-fix: false] - - unparam # Reports unused function parameters [fast: false, auto-fix: false] - - usestdlibvars # A linter that detect the possibility to use variables/constants from the Go standard library. [fast: true, auto-fix: false] - # - varcheck # [deprecated] # Finds unused global variables and constants [fast: false, auto-fix: false] - # - varnamelen # checks that the length of a variable's name matches its scope [fast: false, auto-fix: false] - - wastedassign # wastedassign finds wasted assignment statements. [fast: false, auto-fix: false] - - whitespace # Tool for detection of leading and trailing whitespace [fast: true, auto-fix: true] - # - wrapcheck # Checks that errors returned from external packages are wrapped [fast: false, auto-fix: false] - - wsl # Whitespace Linter - Forces you to use empty lines! 
[fast: true, auto-fix: false] - - zerologlint # Detects the wrong usage of `zerolog` that a user forgets to dispatch with `Send` or `Msg`. [fast: false, auto-fix: false] + - asasalint + - asciicheck + - bidichk + - bodyclose + - canonicalheader + - containedctx + - contextcheck + - copyloopvar + # - cyclop + - decorder + # - depguard + - dogsled + - dupl + - dupword + - durationcheck + - err113 + - errcheck + - errchkjson + - errname + - errorlint + - exhaustive + # - exhaustruct + - fatcontext + - forbidigo + - forcetypeassert + # - funlen + - gci + - ginkgolinter + - gocheckcompilerdirectives + # - gochecknoglobals + # - gochecknoinits + - gochecksumtype + # - gocognit + - goconst + - gocritic + # - gocyclo + - godot + - godox + - gofmt + - gofumpt + - goheader + - goimports + - gomoddirectives + - gomodguard + - goprintffuncname + - gosec + - gosimple + - gosmopolitan + - govet + - grouper + - importas + - inamedparam + - ineffassign + - interfacebloat + - intrange + - ireturn + # - lll + - loggercheck + - maintidx + - makezero + - mirror + - misspell + # - mnd + - musttag + # - nakedret + - nestif + - nilerr + - nilnil + - nlreturn + - noctx + - nolintlint + # - nonamedreturns + - nosprintfhostport + - paralleltest + # - perfsprint + - prealloc + - predeclared + - promlinter + - protogetter + - reassign + - revive + - rowserrcheck + - sloglint + - spancheck + - sqlclosecheck + - staticcheck + - stylecheck + - tagalign + # - tagliatelle + - tenv + - testableexamples + - testifylint + - testpackage + - thelper + - tparallel + - unconvert + - unparam + - unused + - usestdlibvars + # - varnamelen + - wastedassign + - whitespace + - wrapcheck + - wsl + - zerologlint linters-settings: goconst: @@ -135,26 +245,39 @@ linters-settings: # Minimal code complexity to report. # Default: 30 (but we recommend 10-20) min-complexity: 10 - govet: - check-shadowing: true - varnamelen: - # The minimum length of a variable's name that is considered "long". 
- # Variable names that are at least this long will be ignored. - # Default: 3 - min-name-length: 2 - # Check method receivers. - # Default: false - check-receiver: true - # Check named return values. - # Default: false - check-return: true - # Check type parameters. - # Default: false - check-type-param: true + # varnamelen: + # # The minimum length of a variable's name that is considered "long". + # # Variable names that are at least this long will be ignored. + # # Default: 3 + # min-name-length: 2 + # # Check method receivers. + # # Default: false + # check-receiver: true + # # Check named return values. + # # Default: false + # check-return: true + # # Check type parameters. + # # Default: false + # check-type-param: true whitespace: # Enforces newlines (or comments) after every multi-line if statement. # Default: false multi-if: true # Enforces newlines (or comments) after every multi-line function signature. # Default: false - multi-func: true \ No newline at end of file + multi-func: true + +issues: + # Which dirs to exclude: issues from them won't be reported. + # Can use regexp here: `generated.*`, regexp is applied on full path, + # including the path prefix if one is set. + # Default dirs are skipped independently of this option's value (see exclude-dirs-use-default). + # "/" will be replaced by current OS file path separator to properly work on Windows. + # Default: [] + exclude-dirs: [] + # Show issues in any part of updated files (requires new-from-rev or new-from-patch). + # Default: false + whole-files: false + # Fix found issues (if it's supported by the linter).
+ # Default: false + fix: true \ No newline at end of file diff --git a/.vscode/extenstions.json b/.vscode/extenstions.json new file mode 100644 index 0000000..7203cb3 --- /dev/null +++ b/.vscode/extenstions.json @@ -0,0 +1,5 @@ +{ + "recommendations": [ + "golang.go" + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1a653cd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "go.lintTool": "golangci-lint", + "go.lintFlags": [ + "--fast" + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6a6c55f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,67 @@ +# Use the official Golang image version 1.23 with the Alpine distribution as the base image for the build stage. +# This multi-stage build starts with the "build-stage" stage where the Go application will be compiled. +FROM golang:1.23.1-alpine3.20 AS build-stage + +# Perform system updates and install necessary packages. +# - `apk --no-cache update`: Updates the Alpine package repository without caching index files. +# - `apk --no-cache upgrade`: Upgrades all installed packages to the latest available versions. +# - `apk --no-cache add`: Installs additional required packages: +# - `ca-certificates`: For managing CA certificates for secure communication. +# - `curl`: For making HTTP requests (can be used to download files or for health checks). +# - `gcc` and `g++`: The GNU Compiler Collection used for compiling C and C++ code, essential for building Go applications. +# - `git`: Required for downloading Go modules that reference external repositories. +# - `make`: Utility for automating build processes and running the `Makefile`. +RUN < [!CAUTION] > While the development version is a good way to take a peek at `xcrawl3r`'s latest features before they get released, be aware that it may have bugs. Officially released versions will generally be more stable. 
+### Install on Docker (With Docker Installed) + +To install `xcrawl3r` on docker: + +* Pull the docker image using: + + ```bash + docker pull hueristiq/xcrawl3r:latest + ``` + +* Run `xcrawl3r` using the image: + + ```bash + docker run --rm hueristiq/xcrawl3r:latest -h + ``` + ## Usage -To display help message for `xcrawl3r` use the `-h` flag: +To start using `xcrawl3r`, open your terminal and run the following command for a list of options: ```bash xcrawl3r -h ``` -help message: +Here's what the help message looks like: ```text _ _____ @@ -168,22 +185,22 @@ OUTPUT: ## Contributing -[Issues](https://github.com/hueristiq/xcrawl3r/issues) and [Pull Requests](https://github.com/hueristiq/xcrawl3r/pulls) are welcome! **Check out the [contribution guidelines](https://github.com/hueristiq/xcrawl3r/blob/master/CONTRIBUTING.md).** +We welcome contributions! Feel free to submit [Pull Requests](https://github.com/hueristiq/xcrawl3r/pulls) or report [Issues](https://github.com/hueristiq/xcrawl3r/issues). For more details, check out the [contribution guidelines](https://github.com/hueristiq/xcrawl3r/blob/master/CONTRIBUTING.md). ## Licensing -This utility is distributed under the [MIT license](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE). +This utility is licensed under the [MIT license](https://opensource.org/license/mit). You are free to use, modify, and distribute it, as long as you follow the terms of the license. You can find the full license text in the repository - [Full MIT license text](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE). ## Credits ### Contributors -Thanks to the amazing [contributors](https://github.com/hueristiq/xcrawl3r/graphs/contributors) for keeping this project alive. +A huge thanks to all the contributors who have helped make `xcrawl3r` what it is today! 
[![contributors](https://contrib.rocks/image?repo=hueristiq/xcrawl3r&max=500)](https://github.com/hueristiq/xcrawl3r/graphs/contributors) ### Similar Projects -Thanks to similar open source projects - check them out, may fit in your workflow. +If you're interested in more utilities like this, check out: -[katana](https://github.com/projectdiscovery/katana) ◇ [gospider](https://github.com/jaeles-project/gospider) ◇ [hakrawler](https://github.com/hakluke/hakrawler) ◇ [urlgrab](https://github.com/IAmStoxe/urlgrab) \ No newline at end of file +[gospider](https://github.com/jaeles-project/gospider) ◇ [hakrawler](https://github.com/hakluke/hakrawler) ◇ [katana](https://github.com/projectdiscovery/katana) ◇ [urlgrab](https://github.com/IAmStoxe/urlgrab) \ No newline at end of file diff --git a/cmd/xcrawl3r/main.go b/cmd/xcrawl3r/main.go index 1e8e738..368870d 100644 --- a/cmd/xcrawl3r/main.go +++ b/cmd/xcrawl3r/main.go @@ -6,11 +6,12 @@ import ( "io/fs" "os" "path/filepath" + "strings" + hqgourl "github.com/hueristiq/hq-go-url" "github.com/hueristiq/hqgolog" "github.com/hueristiq/hqgolog/formatter" "github.com/hueristiq/hqgolog/levels" - "github.com/hueristiq/hqgourl" "github.com/hueristiq/xcrawl3r/internal/configuration" "github.com/hueristiq/xcrawl3r/pkg/xcrawl3r" "github.com/logrusorgru/aurora/v3" @@ -41,7 +42,9 @@ var ( debug bool monochrome bool output string - verbosity string + + silent bool + verbose bool ) func init() { @@ -67,7 +70,9 @@ func init() { pflag.BoolVar(&debug, "debug", false, "") pflag.BoolVarP(&monochrome, "monochrome", "m", false, "") pflag.StringVarP(&output, "output", "o", "", "") - pflag.StringVarP(&verbosity, "verbosity", "v", string(levels.LevelInfo), "") + + pflag.BoolVar(&silent, "silent", false, "") + pflag.BoolVarP(&verbose, "verbose", "v", false, "") pflag.CommandLine.SortFlags = false pflag.Usage = func() { @@ -108,7 +113,8 @@ func init() { h += " --debug bool enable debug mode (default: false)\n" h += " -m, --monochrome bool coloring: 
no colored output mode\n" h += " -o, --output string output file to write found URLs\n" - h += " -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)\n" + h += " --silent bool display output URLs only\n" + h += " -v, --verbose bool display verbose output\n" fmt.Fprint(os.Stderr, h) } @@ -116,7 +122,12 @@ func init() { pflag.Parse() // Initialize logger - hqgolog.DefaultLogger.SetMaxLevel(levels.LevelStr(verbosity)) + hqgolog.DefaultLogger.SetMaxLevel(levels.LevelInfo) + + if verbose { + hqgolog.DefaultLogger.SetMaxLevel(levels.LevelDebug) + } + hqgolog.DefaultLogger.SetFormatter(formatter.NewCLI(&formatter.CLIOptions{ Colorize: !monochrome, })) @@ -125,14 +136,18 @@ func init() { } func main() { - if verbosity != string(levels.LevelSilent) { + if !silent { fmt.Fprintln(os.Stderr, configuration.BANNER) } + hqgolog.Print().Msg("") + if seedsFile != "" && URL == "" && domain == "" { hqgolog.Fatal().Msg("using `-s, --seeds` requires either `-d, --domain` or `-u, --url` to be set!") } + up := hqgourl.NewParser() + // Load input URLs seeds := []string{} @@ -140,7 +155,13 @@ func main() { seeds = append(seeds, URL) if domain == "" { - domain = URL + parsed, err := up.Parse(URL) + if err != nil { + hqgolog.Fatal().Msg(err.Error()) + } + + domain = parsed.Domain.String() + domain = strings.TrimPrefix(domain, "www.") } } @@ -187,13 +208,8 @@ func main() { } } - parsedURL, err := hqgourl.Parse(domain) - if err != nil { - hqgolog.Fatal().Msgf("%s", err) - } - - options := &xcrawl3r.Options{ - Domain: parsedURL.Domain, + cfg := &xcrawl3r.Configuration{ + Domain: domain, IncludeSubdomains: includeSubdomains, Seeds: seeds, @@ -213,12 +229,12 @@ func main() { Debug: debug, } - crawler, err := xcrawl3r.New(options) + crawler, err := xcrawl3r.New(cfg) if err != nil { - hqgolog.Fatal().Msgf("%s", err) + hqgolog.Fatal().Msg(err.Error()) } - URLs := crawler.Crawl() + var writer *bufio.Writer if output != "" { directory := filepath.Dir(output) @@ -229,34 
+245,37 @@ func main() { } } - file, err := os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) + var file *os.File + + file, err = os.OpenFile(output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { hqgolog.Fatal().Msg(err.Error()) } defer file.Close() - writer := bufio.NewWriter(file) + writer = bufio.NewWriter(file) + } - for outputURL := range URLs { - if verbosity == string(levels.LevelSilent) { - hqgolog.Print().Msg(outputURL.Value) + for URL := range crawler.Crawl() { + switch URL.Type { + case xcrawl3r.ResultError: + if verbose { + hqgolog.Error().Msgf("%s: %s\n", URL.Source, URL.Error) + } + case xcrawl3r.ResultURL: + if verbose { + hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(URL.Source), URL.Value) } else { - hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(outputURL.Source), outputURL.Value) + hqgolog.Print().Msg(URL.Value) } - fmt.Fprintln(writer, outputURL.Value) - } + if writer != nil { + fmt.Fprintln(writer, URL.Value) - if err = writer.Flush(); err != nil { - hqgolog.Fatal().Msg(err.Error()) - } - } else { - for outputURL := range URLs { - if verbosity == string(levels.LevelSilent) { - hqgolog.Print().Msg(outputURL.Value) - } else { - hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(outputURL.Source), outputURL.Value) + if err := writer.Flush(); err != nil { + hqgolog.Fatal().Msg(err.Error()) + } } } } diff --git a/go.mod b/go.mod index af13f08..448675c 100644 --- a/go.mod +++ b/go.mod @@ -1,43 +1,43 @@ module github.com/hueristiq/xcrawl3r -go 1.21.0 +go 1.23.1 require ( - github.com/chromedp/chromedp v0.9.3 + github.com/chromedp/chromedp v0.10.0 github.com/gocolly/colly/v2 v2.1.0 - github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead + github.com/hueristiq/hq-go-http v0.0.0-20241013111525-65cf25fd7246 + github.com/hueristiq/hq-go-url v0.0.0-20241006190408-b9120b7b7d91 github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f - github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53 
github.com/logrusorgru/aurora/v3 v3.0.0 - github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 github.com/spf13/pflag v1.0.5 ) require ( github.com/Mzack9999/go-http-digest-auth-client v0.6.0 // indirect - github.com/PuerkitoBio/goquery v1.8.1 // indirect + github.com/PuerkitoBio/goquery v1.10.0 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect - github.com/antchfx/htmlquery v1.3.0 // indirect - github.com/antchfx/xmlquery v1.3.18 // indirect - github.com/antchfx/xpath v1.2.5 // indirect - github.com/chromedp/cdproto v0.0.0-20231205062650-00455a960d61 // indirect + github.com/antchfx/htmlquery v1.3.3 // indirect + github.com/antchfx/xmlquery v1.4.2 // indirect + github.com/antchfx/xpath v1.3.2 // indirect + github.com/chromedp/cdproto v0.0.0-20241003230502-a4a8f7c660df // indirect github.com/chromedp/sysutil v1.0.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect - github.com/gobwas/ws v1.3.1 // indirect + github.com/gobwas/ws v1.4.0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/golang/protobuf v1.5.3 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/hueristiq/hq-go-retrier v0.0.0-20241006185156-b525dc69639c // indirect github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/temoto/robotstxt v1.1.2 // indirect - golang.org/x/net v0.19.0 // indirect - golang.org/x/sys v0.15.0 // indirect - golang.org/x/term v0.15.0 // indirect - golang.org/x/text v0.14.0 // indirect + golang.org/x/net v0.30.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + golang.org/x/text v0.19.0 // 
indirect google.golang.org/appengine v1.6.8 // indirect - google.golang.org/protobuf v1.31.0 // indirect + google.golang.org/protobuf v1.35.1 // indirect ) diff --git a/go.sum b/go.sum index d011f13..264abd6 100644 --- a/go.sum +++ b/go.sum @@ -3,33 +3,28 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/Mzack9999/go-http-digest-auth-client v0.6.0 h1:LXVNMsj7qiNVmlZByFbjJmXf6SOm/uoo04XmnNcWPms= github.com/Mzack9999/go-http-digest-auth-client v0.6.0/go.mod h1:gbwaYYXwA15ZfIxMyY5QU1acATDyNKEuG5TylBCL7AM= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= -github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= -github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= +github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= +github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= -github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= -github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= -github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= +github.com/antchfx/htmlquery v1.3.3 h1:x6tVzrRhVNfECDaVxnZi1mEGrQg3mjE/rxbH2Pe6dNE= +github.com/antchfx/htmlquery v1.3.3/go.mod h1:WeU3N7/rL6mb6dCwtE30dURBnBieKDC/fR8t6X+cKjU= github.com/antchfx/xmlquery v1.2.4/go.mod 
h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= -github.com/antchfx/xmlquery v1.3.18 h1:FSQ3wMuphnPPGJOFhvc+cRQ2CT/rUj4cyQXkJcjOwz0= -github.com/antchfx/xmlquery v1.3.18/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA= +github.com/antchfx/xmlquery v1.4.2 h1:MZKd9+wblwxfQ1zd1AdrTsqVaMjMCwow3IqkCSe00KA= +github.com/antchfx/xmlquery v1.4.2/go.mod h1:QXhvf5ldTuGqhd1SHNvvtlhhdQLks4dD0awIVhXIDTA= github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= -github.com/antchfx/xpath v1.2.5 h1:hqZ+wtQ+KIOV/S3bGZcIhpgYC26um2bZYP2KVGcR7VY= -github.com/antchfx/xpath v1.2.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U= +github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/cdproto v0.0.0-20231126232103-8e31ff06e23b h1:SywfM3985mh0PaXhiZbgR+VQFSfFXJSNLX2p+3vBvOc= -github.com/chromedp/cdproto v0.0.0-20231126232103-8e31ff06e23b/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/cdproto v0.0.0-20231205062650-00455a960d61 h1:XD280QPATe9jaz20dylKe3vBsNcH1w3mkssGY0lidn8= -github.com/chromedp/cdproto v0.0.0-20231205062650-00455a960d61/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/chromedp v0.9.3 h1:Wq58e0dZOdHsxaj9Owmfcf+ibtpYN1N0FWVbaxa/esg= -github.com/chromedp/chromedp v0.9.3/go.mod h1:NipeUkUcuzIdFbBP8eNNvl9upcceOfWzoJn6cRe4ksA= +github.com/chromedp/cdproto 
v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/cdproto v0.0.0-20241003230502-a4a8f7c660df h1:cbtSn19AtqQha1cxmP2Qvgd3fFMz51AeAEKLJMyEUhc= +github.com/chromedp/cdproto v0.0.0-20241003230502-a4a8f7c660df/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E= +github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE= github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= @@ -44,10 +39,8 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.3.0/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= -github.com/gobwas/ws v1.3.1 h1:Qi34dfLMWJbiKaNbDVzM9x27nZBjmkaW6i4+Ku+pGVU= -github.com/gobwas/ws v1.3.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= -github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= +github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= @@ -68,20 +61,22 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD github.com/golang/protobuf 
v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead h1:Iep2G2h3hSwc7w0qr1iVVAptgXqjn7DRXVQ33luPmhk= -github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead/go.mod h1:Faf/mOhyfNnLIfhoYj2vfPrjt0nKBr4WaU+OQ0C7r6U= +github.com/hueristiq/hq-go-http v0.0.0-20241013111525-65cf25fd7246 h1:7vtPVtFXzDdeHxQyNpn+GL/0389EtOD+M/Nxf4ByVkA= +github.com/hueristiq/hq-go-http v0.0.0-20241013111525-65cf25fd7246/go.mod h1:HNq1LSRIFndL7YzB17UVatJxsbWQFZkaaJvJv5AYWX0= +github.com/hueristiq/hq-go-retrier v0.0.0-20241006185156-b525dc69639c h1:5QeToyM/D2hQzXdlQ+PiF24xbNcHETMLF1ZRR+rYDYI= +github.com/hueristiq/hq-go-retrier v0.0.0-20241006185156-b525dc69639c/go.mod h1:YkxIHoJHsL0wmzQ3tc0qz4UTr9q9eCicUt5RvMV//xw= +github.com/hueristiq/hq-go-url v0.0.0-20241006190408-b9120b7b7d91 h1:CIHiYD3YJFlCDqZlXxD/zH6xnMcF3/XC7yGeR3DdLso= +github.com/hueristiq/hq-go-url 
v0.0.0-20241006190408-b9120b7b7d91/go.mod h1:oDQP/s9eYGtsTz7LrbCC9CKbq6TmfB9b7yAoPCIqqyQ= github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f h1:JAgZOIJ+UbkENpRiOTlfg51CW0UNrUkgwLjUGiH+x9g= github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f/go.mod h1:S5J3E3Azva5+JKv67uc+Hh3XwLDvkVYDGjEaMTFrIqg= -github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53 h1:6pwdpEJoB1woSToh0cxLh5QirNOAp2z7DzvMKiaqdro= -github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53/go.mod h1:Fc2vfWpIVFWUmCv1S0xVsz3mIPYwdgsa6f2vCgL4CrA= github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 h1:dpHAa9c74HgAXkZ2WPd84q2cCiF76eluuSGRw7bk7To= github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440/go.mod h1:NlZ117o///yWDbRAbgYD7/Y44qce8z1Dj4caUsjunSY= github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= @@ -97,8 +92,6 @@ github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= -github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 h1:2vmb32OdDhjZf2ETGDlr9n8RYXx7c+jXPxMiPbwnA+8= -github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4/go.mod h1:2JQx4jDHmWrbABvpOayg/+OTU6ehN0IyK2EHzceXpJo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -109,9 +102,11 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod 
h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= @@ -137,14 +132,12 @@ golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net 
v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= -golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -156,34 +149,30 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= -golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= -golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= -golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.19.0 
h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -218,8 +207,8 @@ google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpAD google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go index f17ddc9..bd51ef5 100644 --- a/internal/configuration/configuration.go +++ b/internal/configuration/configuration.go @@ -14,9 +14,6 @@ __ _____ _ __ __ ___ _| |___ / _ __ \ \/ / __| '__/ _`+"`"+` \ \ /\ / / | |_ \| '__| > < (__| | | (_| |\ V V /| |___) | | /_/\_\___|_| \__,_| \_/\_/ |_|____/|_| - %s - - %s`).Bold(), + %s`).Bold(), aurora.BrightRed("v"+VERSION).Bold(), - aurora.BrightYellow("with <3 by Hueristiq Open Source").Italic(), ) diff --git 
a/pkg/parser/sitemap/sitemap.go b/pkg/parser/sitemap/sitemap.go new file mode 100644 index 0000000..7f97af2 --- /dev/null +++ b/pkg/parser/sitemap/sitemap.go @@ -0,0 +1,154 @@ +package sitemap + +import ( + "encoding/xml" + "errors" + "io" +) + +type entry struct { + Type EntryType + Location string `xml:"loc"` + LastModified string `xml:"lastmod,omitempy"` + ChangeFrequency EntryChangeFrequency `xml:"changefreq,omitempty"` + Priority float32 `xml:"priority,omitempty"` +} + +func (e *entry) GetType() EntryType { + return e.Type +} + +func (e *entry) GetLocation() string { + return e.Location +} + +func (e *entry) GetChangeFrequency() EntryChangeFrequency { + return e.ChangeFrequency +} + +func (e *entry) GetPriority() float32 { + return e.Priority +} + +type EntryType string + +func (t EntryType) String() (entryType string) { + entryType = string(t) + + return +} + +type EntryChangeFrequency string + +func (f EntryChangeFrequency) String() (entryChangeFrequency string) { + entryChangeFrequency = string(f) + + return +} + +type Consumer func(entry Entry) (err error) + +type elementParser func(*xml.Decoder, *xml.StartElement) error + +type Entry interface { + GetType() EntryType + GetLocation() string + GetChangeFrequency() EntryChangeFrequency + GetPriority() float32 +} + +const ( + EntryTypeSitemap EntryType = "sitemap" + EntryTypeURL EntryType = "url" + + EntryChangeFrequencyAlways EntryChangeFrequency = "always" + EntryChangeFrequencyHourly EntryChangeFrequency = "hourly" + EntryChangeFrequencyDaily EntryChangeFrequency = "daily" + EntryChangeFrequencyWeekly EntryChangeFrequency = "weekly" + EntryChangeFrequencyMonthly EntryChangeFrequency = "monthly" + EntryChangeFrequencyYearly EntryChangeFrequency = "yearly" + EntryChangeFrequencyNever EntryChangeFrequency = "never" +) + +func Parse(reader io.Reader, consumer Consumer) (err error) { + return parseLoop(reader, func(d *xml.Decoder, se *xml.StartElement) (err error) { + return entryParser(d, se, consumer) + }) +} 
+ +func entryParser(decoder *xml.Decoder, se *xml.StartElement, consume Consumer) (err error) { + if se.Name.Local == "url" { + entry := newURLEntry() + + if err = decoder.DecodeElement(entry, se); err != nil { + return + } + + if err = consume(entry); err != nil { + return + } + } + + if se.Name.Local == "sitemap" { + entry := newSitemapEntry() + + if err = decoder.DecodeElement(entry, se); err != nil { + return + } + + if err = consume(entry); err != nil { + return + } + } + + return +} + +func newURLEntry() (instance *entry) { + instance = &entry{ + Type: EntryTypeURL, + ChangeFrequency: EntryChangeFrequencyAlways, + Priority: 0.5, + } + + return +} + +func newSitemapEntry() (instance *entry) { + instance = &entry{ + Type: EntryTypeSitemap, + } + + return +} + +func parseLoop(reader io.Reader, parser elementParser) (err error) { + decoder := xml.NewDecoder(reader) + + for { + var token xml.Token + + token, err = decoder.Token() + + if errors.Is(err, io.EOF) { + err = nil + + break + } + + if err != nil { + return + } + + se, ok := token.(xml.StartElement) + if !ok { + continue + } + + if err = parser(decoder, &se); err != nil { + return + } + } + + return +} diff --git a/pkg/xcrawl3r/output.go b/pkg/xcrawl3r/output.go deleted file mode 100644 index 5b12081..0000000 --- a/pkg/xcrawl3r/output.go +++ /dev/null @@ -1,6 +0,0 @@ -package xcrawl3r - -type URL struct { - Source string - Value string -} diff --git a/pkg/xcrawl3r/page_strategy.go b/pkg/xcrawl3r/page_strategy.go index a9bcb61..a190b99 100644 --- a/pkg/xcrawl3r/page_strategy.go +++ b/pkg/xcrawl3r/page_strategy.go @@ -6,15 +6,15 @@ import ( "strings" "github.com/gocolly/colly/v2" - "github.com/hueristiq/hqgourl" + hqgourl "github.com/hueristiq/hq-go-url" "github.com/hueristiq/xcrawl3r/pkg/browser" ) -func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) { - URLsChannel = make(chan URL) +func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) <-chan Result { + results := make(chan 
Result) go func() { - defer close(URLsChannel) + defer close(results) if crawler.Render { // If we're using a proxy send it to the chrome instance @@ -26,6 +26,7 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) // If renderJavascript, pass the response's body to the renderer and then replace the body for .OnHTML to handle. crawler.PageCollector.OnResponse(func(request *colly.Response) { html := browser.GetRenderedSource(request.Request.URL.String()) + request.Body = []byte(html) }) } @@ -49,16 +50,20 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) }) crawler.FileCollector.OnError(func(_ *colly.Response, err error) { - }) + result := Result{ + Type: ResultError, + Source: "page", + Error: err, + } - crawler.FileCollector.OnResponse(func(response *colly.Response) { + results <- result }) crawler.PageCollector.OnHTML("[href]", func(e *colly.HTMLElement) { relativeURL := e.Attr("href") absoluteURL := e.Request.AbsoluteURL(relativeURL) - parsedAbsoluteURL, err := hqgourl.Parse(absoluteURL) + parsedAbsoluteURL, err := up.Parse(absoluteURL) if err != nil { return } @@ -72,9 +77,23 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) return } - URLsChannel <- URL{Source: "page:href", Value: absoluteURL} + result := Result{ + Type: ResultURL, + Source: "page:href", + Value: absoluteURL, + } + + results <- result if err = e.Request.Visit(absoluteURL); err != nil { + result := Result{ + Type: ResultError, + Source: "page:href", + Error: err, + } + + results <- result + return } }) @@ -87,10 +106,24 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) return } - URLsChannel <- URL{Source: "page:src", Value: absoluteURL} + result := Result{ + Type: ResultURL, + Source: "page:src", + Value: absoluteURL, + } + + results <- result if match := crawler.FileURLsRegex.MatchString(absoluteURL); match { if err := crawler.FileCollector.Visit(absoluteURL); 
err != nil { + result := Result{ + Type: ResultError, + Source: "page:src", + Error: err, + } + + results <- result + return } @@ -98,6 +131,14 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) } if err := e.Request.Visit(absoluteURL); err != nil { + result := Result{ + Type: ResultError, + Source: "page:src", + Error: err, + } + + results <- result + return } }) @@ -141,15 +182,37 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) continue } - URLsChannel <- URL{Source: "file:" + ext, Value: fileURL} + result := Result{ + Type: ResultURL, + Source: "file:" + ext, + Value: fileURL, + } + + results <- result if err := crawler.PageCollector.Visit(fileURL); err != nil { + result := Result{ + Type: ResultError, + Source: "file:" + ext, + Error: err, + } + + results <- result + return } } }) if err := crawler.PageCollector.Visit(parsedURL.String()); err != nil { + result := Result{ + Type: ResultError, + Source: "page", + Error: err, + } + + results <- result + return } @@ -157,5 +220,5 @@ func (crawler *Crawler) pageCrawl(parsedURL *hqgourl.URL) (URLsChannel chan URL) crawler.FileCollector.Wait() }() - return + return results } diff --git a/pkg/xcrawl3r/result.go b/pkg/xcrawl3r/result.go new file mode 100644 index 0000000..64f846d --- /dev/null +++ b/pkg/xcrawl3r/result.go @@ -0,0 +1,19 @@ +package xcrawl3r + +// Result represents the outcome of an operation or request, including the type of result, +// the source of the data, the actual value retrieved (if applicable), and any error encountered. +type Result struct { + Type ResultType // Specifies the type of result (e.g., a URL or an error). + Source string // Indicates the source from which the result was obtained (e.g., a specific API or service). + Value string // Holds the value of the result, such as a URL or any other data returned from the operation. 
+ Error error // Holds any error that occurred during the operation, or nil if no error occurred. +} + +// ResultType defines the type of result using an integer type. It can represent different +// kinds of outcomes from an operation, such as a URL or an error. +type ResultType int + +const ( + ResultURL ResultType = iota // Represents a successful result containing a URL. + ResultError // Represents a result where an error occurred during the operation. +) diff --git a/pkg/xcrawl3r/robots_strategy.go b/pkg/xcrawl3r/robots_strategy.go index 3fb4b4d..c51a40a 100644 --- a/pkg/xcrawl3r/robots_strategy.go +++ b/pkg/xcrawl3r/robots_strategy.go @@ -1,57 +1,109 @@ package xcrawl3r import ( + "errors" "fmt" "io" "regexp" "strings" - "github.com/hueristiq/hqgohttp" - "github.com/hueristiq/hqgohttp/status" - "github.com/hueristiq/hqgourl" + hqgohttp "github.com/hueristiq/hq-go-http" + "github.com/hueristiq/hq-go-http/status" + hqgourl "github.com/hueristiq/hq-go-url" ) -func (crawler *Crawler) robotsParsing(parsedURL *hqgourl.URL) (URLsChannel chan URL) { - URLsChannel = make(chan URL) +func (crawler *Crawler) robotsParsing(parsedURL *hqgourl.URL) <-chan Result { + results := make(chan Result) go func() { - defer close(URLsChannel) + defer close(results) robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host) res, err := hqgohttp.Get(robotsURL) if err != nil { + result := Result{ + Type: ResultError, + Source: "known:robots", + Error: err, + } + + results <- result + return } defer res.Body.Close() - if res.StatusCode == status.OK { - URLsChannel <- URL{Source: "known", Value: robotsURL} + if res.StatusCode != status.OK { + result := Result{ + Type: ResultError, + Source: "known:robots", + Error: errors.New("unexpected status code"), + } + + results <- result + + return + } + + result := Result{ + Type: ResultURL, + Source: "known:robots", + Value: robotsURL, + } + + results <- result - body, err := io.ReadAll(res.Body) - if err != nil { - return + 
body, err := io.ReadAll(res.Body) + if err != nil { + result := Result{ + Type: ResultError, + Source: "known:robots", + Error: err, } - lines := strings.Split(string(body), "\n") + results <- result - re := regexp.MustCompile(".*llow: ") + return + } + + lines := strings.Split(string(body), "\n") + + re := regexp.MustCompile(".*llow: ") + + for _, line := range lines { + if !strings.Contains(line, "llow: ") { + continue + } - for _, line := range lines { - if strings.Contains(line, "llow: ") { - rfURL := re.ReplaceAllString(line, "") - rfURL = fmt.Sprintf("%s://%s%s", parsedURL.Scheme, parsedURL.Host, rfURL) + rfURL := re.ReplaceAllString(line, "") - URLsChannel <- URL{Source: "robots", Value: rfURL} + rfURL = strings.ReplaceAll(rfURL, "*", "") + rfURL = strings.TrimPrefix(rfURL, "/") + rfURL = fmt.Sprintf("%s://%s/%s", parsedURL.Scheme, parsedURL.Host, rfURL) - if err = crawler.PageCollector.Visit(rfURL); err != nil { - continue - } + result := Result{ + Type: ResultURL, + Source: "robots", + Value: rfURL, + } + + results <- result + + if err = crawler.PageCollector.Visit(rfURL); err != nil { + result := Result{ + Type: ResultError, + Source: "robots", + Error: err, } + + results <- result + + continue } } }() - return + return results } diff --git a/pkg/xcrawl3r/sitemap_strategy.go b/pkg/xcrawl3r/sitemap_strategy.go index a92f367..2853ed7 100644 --- a/pkg/xcrawl3r/sitemap_strategy.go +++ b/pkg/xcrawl3r/sitemap_strategy.go @@ -2,16 +2,18 @@ package xcrawl3r import ( "fmt" + "net/http" - "github.com/hueristiq/hqgourl" - sitemap "github.com/oxffaa/gopher-parse-sitemap" + hqgohttp "github.com/hueristiq/hq-go-http" + hqgourl "github.com/hueristiq/hq-go-url" + sitemap "github.com/hueristiq/xcrawl3r/pkg/parser/sitemap" ) -func (crawler *Crawler) sitemapParsing(parsedURL *hqgourl.URL) (URLsChannel chan URL) { - URLsChannel = make(chan URL) +func (crawler *Crawler) sitemapParsing(parsedURL *hqgourl.URL) <-chan Result { + results := make(chan Result) go func() { - defer 
close(URLsChannel) + defer close(results) sitemapPaths := []string{ "/sitemap.xml", @@ -31,24 +33,74 @@ func (crawler *Crawler) sitemapParsing(parsedURL *hqgourl.URL) (URLsChannel chan for _, path := range sitemapPaths { sitemapURL := fmt.Sprintf("%s://%s%s", parsedURL.Scheme, parsedURL.Host, path) - err := sitemap.ParseFromSite(sitemapURL, func(entry sitemap.Entry) (err error) { - smURL := entry.GetLocation() - - URLsChannel <- URL{Source: "sitemap", Value: smURL} - - if err = crawler.PageCollector.Visit(smURL); err != nil { - return + if err := crawler.parseSitemap(sitemapURL, results); err != nil { + result := Result{ + Type: ResultError, + Source: "known:sitemap", + Error: err, } - return - }) - if err != nil { + results <- result + continue } - URLsChannel <- URL{Source: "known", Value: sitemapURL} + result := Result{ + Type: ResultURL, + Source: "known:sitemap", + Value: sitemapURL, + } + + results <- result } }() + return results +} + +func (crawler *Crawler) parseSitemap(URL string, results chan Result) (err error) { + var res *http.Response + + res, err = hqgohttp.Get(URL) + if err != nil { + return + } + + if err = sitemap.Parse(res.Body, func(entry sitemap.Entry) (err error) { + sitemapEntryURL := entry.GetLocation() + + result := Result{ + Type: ResultURL, + Source: "sitemap", + Value: sitemapEntryURL, + } + + results <- result + + if entry.GetType() == sitemap.EntryTypeSitemap { + return crawler.parseSitemap(sitemapEntryURL, results) + } + + if err = crawler.PageCollector.Visit(sitemapEntryURL); err != nil { + result := Result{ + Type: ResultError, + Source: "sitemap", + Error: err, + } + + results <- result + + err = nil + + return + } + + return + }); err != nil { + return + } + + res.Body.Close() + return } diff --git a/pkg/xcrawl3r/utils.go b/pkg/xcrawl3r/utils.go index 9c7890d..5a4bfd6 100644 --- a/pkg/xcrawl3r/utils.go +++ b/pkg/xcrawl3r/utils.go @@ -3,7 +3,7 @@ package xcrawl3r import ( "strings" - "github.com/hueristiq/hqgourl" + hqgourl 
"github.com/hueristiq/hq-go-url" ) func decode(source string) (decodedSource string) { @@ -50,15 +50,19 @@ func (crawler *Crawler) fixURL(parsedURL *hqgourl.URL, URL string) (fixedURL str } func (crawler *Crawler) IsInScope(URL string) (isInScope bool) { - parsedURL, err := hqgourl.Parse(URL) + parsedURL, err := up.Parse(URL) if err != nil { return } + if parsedURL.Domain == nil { + return + } + if crawler.IncludeSubdomains { - isInScope = parsedURL.Domain == crawler.Domain || strings.HasSuffix(parsedURL.Domain, "."+crawler.Domain) + isInScope = parsedURL.Domain.String() == crawler.Domain || strings.HasSuffix(parsedURL.Domain.String(), "."+crawler.Domain) } else { - isInScope = parsedURL.Domain == crawler.Domain || parsedURL.Domain == "www."+crawler.Domain + isInScope = parsedURL.Domain.String() == crawler.Domain || parsedURL.Domain.String() == "www."+crawler.Domain } return diff --git a/pkg/xcrawl3r/xcrawl3r.go b/pkg/xcrawl3r/xcrawl3r.go index 275ad11..0f7a344 100644 --- a/pkg/xcrawl3r/xcrawl3r.go +++ b/pkg/xcrawl3r/xcrawl3r.go @@ -14,83 +14,179 @@ import ( "github.com/gocolly/colly/v2/debug" "github.com/gocolly/colly/v2/extensions" "github.com/gocolly/colly/v2/proxy" - "github.com/hueristiq/hqgourl" + hqgourl "github.com/hueristiq/hq-go-url" "github.com/hueristiq/xcrawl3r/internal/configuration" ) -type Options struct { +type Crawler struct { Domain string IncludeSubdomains bool Seeds []string - Depth int - Headless bool - Headers []string - Proxies []string - Render bool - Timeout int // seconds + Headless bool + Headers []string + Proxies []string + Render bool + // Timeout int UserAgent string - Concurrency int - Delay int // seconds - MaxRandomDelay int // seconds - Parallelism int + // Concurrency int + Delay int + // MaxRandomDelay int + Parallelism int Debug bool + + FileURLsRegex *regexp.Regexp + + URLsNotToRequestRegex *regexp.Regexp + URLsRegex *regexp.Regexp + + PageCollector *colly.Collector + FileCollector *colly.Collector } -type Crawler struct { 
+func (crawler *Crawler) Crawl() (results chan Result) { + results = make(chan Result) + + go func() { + defer close(results) + + seedsChannel := make(chan string, crawler.Parallelism) + + go func() { + defer close(seedsChannel) + + for index := range crawler.Seeds { + seed := crawler.Seeds[index] + + seedsChannel <- seed + } + }() + + URLsWG := new(sync.WaitGroup) + + for range crawler.Parallelism { + URLsWG.Add(1) + + go func() { + defer URLsWG.Done() + + for seed := range seedsChannel { + parsedSeed, err := up.Parse(seed) + if err != nil { + continue + } + + seenURLs := &sync.Map{} + + wg := &sync.WaitGroup{} + + wg.Add(1) + + go func() { + defer wg.Done() + + for URL := range crawler.sitemapParsing(parsedSeed) { + _, loaded := seenURLs.LoadOrStore(URL.Value, struct{}{}) + if loaded { + continue + } + + results <- URL + } + }() + + wg.Add(1) + + go func() { + defer wg.Done() + + for URL := range crawler.robotsParsing(parsedSeed) { + _, loaded := seenURLs.LoadOrStore(URL, struct{}{}) + if loaded { + continue + } + + results <- URL + } + }() + + wg.Add(1) + + go func() { + defer wg.Done() + + for URL := range crawler.pageCrawl(parsedSeed) { + _, loaded := seenURLs.LoadOrStore(URL, struct{}{}) + if loaded { + continue + } + + results <- URL + } + }() + + wg.Wait() + } + }() + } + + URLsWG.Wait() + }() + + return +} + +type Configuration struct { + Depth int + Domain string IncludeSubdomains bool Seeds []string - Depth int Headless bool Headers []string Proxies []string Render bool - Timeout int + Timeout int // seconds UserAgent string Concurrency int - Delay int - MaxRandomDelay int + Delay int // seconds + MaxRandomDelay int // seconds Parallelism int Debug bool - - PageCollector *colly.Collector - FileURLsRegex *regexp.Regexp - FileCollector *colly.Collector - URLsNotToRequestRegex *regexp.Regexp - URLsRegex *regexp.Regexp } -var DefaultUserAgent = fmt.Sprintf("%s v%s (https://github.com/hueristiq/%s)", configuration.NAME, configuration.VERSION, 
configuration.NAME) +var ( + DefaultUserAgent = fmt.Sprintf("%s v%s (https://github.com/hueristiq/%s)", configuration.NAME, configuration.VERSION, configuration.NAME) + up = hqgourl.NewParser() +) -func New(options *Options) (crawler *Crawler, err error) { +func New(cfg *Configuration) (crawler *Crawler, err error) { crawler = &Crawler{ - Domain: options.Domain, - IncludeSubdomains: options.IncludeSubdomains, - Seeds: options.Seeds, - - Depth: options.Depth, - Headless: options.Headless, - Headers: options.Headers, - Proxies: options.Proxies, - Render: options.Render, - Timeout: options.Timeout, - UserAgent: options.UserAgent, - - Concurrency: options.Concurrency, - Delay: options.Delay, - MaxRandomDelay: options.MaxRandomDelay, - Parallelism: options.Parallelism, - - Debug: options.Debug, + Domain: cfg.Domain, + IncludeSubdomains: cfg.IncludeSubdomains, + Seeds: cfg.Seeds, + + Headless: cfg.Headless, + Headers: cfg.Headers, + Proxies: cfg.Proxies, + Render: cfg.Render, + // Timeout: cfg.Timeout, + UserAgent: cfg.UserAgent, + + // Concurrency: cfg.Concurrency, + Delay: cfg.Delay, + // MaxRandomDelay: cfg.MaxRandomDelay, + Parallelism: cfg.Parallelism, + + Debug: cfg.Debug, } - crawler.URLsRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`) //nolint:gocritic // Works fine! 
+ crawler.URLsRegex = hqgourl.NewExtractor().CompileRegex() crawler.FileURLsRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`) @@ -99,27 +195,27 @@ func New(options *Options) (crawler *Crawler, err error) { crawler.PageCollector = colly.NewCollector( colly.Async(true), colly.IgnoreRobotsTxt(), - colly.MaxDepth(crawler.Depth), + colly.MaxDepth(cfg.Depth), colly.AllowedDomains(crawler.Domain, "www."+crawler.Domain), ) if crawler.IncludeSubdomains { - crawler.PageCollector.AllowedDomains = nil + crawler.PageCollector.AllowedDomains = []string{} - escapedDomain := regexp.QuoteMeta(crawler.Domain) - pattern := fmt.Sprintf(`https?://([a-z0-9.-]*\.)?%s(/[a-zA-Z0-9()/*\-+_~:,.?#=]*)?`, escapedDomain) + // pattern := fmt.Sprintf(`https?://([a-z0-9.-]*\.)?%s(/[a-zA-Z0-9()/*\-+_~:,.?#=]*)?`, regexp.QuoteMeta(crawler.Domain)) crawler.PageCollector.URLFilters = []*regexp.Regexp{ - regexp.MustCompile(pattern), + // regexp.MustCompile(pattern), + hqgourl.NewExtractor(hqgourl.ExtractorWithSchemePattern(`(?:https?)://`)).CompileRegex(), } } - crawler.PageCollector.SetRequestTimeout(time.Duration(crawler.Timeout) * time.Second) + crawler.PageCollector.SetRequestTimeout(time.Duration(cfg.Timeout) * time.Second) if err = crawler.PageCollector.Limit(&colly.LimitRule{ DomainGlob: "*", - Parallelism: crawler.Concurrency, - RandomDelay: time.Duration(crawler.MaxRandomDelay) * time.Second, + Parallelism: cfg.Concurrency, + RandomDelay: time.Duration(cfg.MaxRandomDelay) * time.Second, }); err != nil { return } @@ -168,13 +264,13 @@ func New(options *Options) (crawler *Crawler, err error) { HTTPTransport := &http.Transport{ DialContext: (&net.Dialer{ - Timeout: time.Duration(crawler.Timeout) * time.Second, - KeepAlive: time.Duration(crawler.Timeout) * time.Second, + Timeout: time.Duration(cfg.Timeout) * time.Second, + KeepAlive: time.Duration(cfg.Timeout) * time.Second, }).DialContext, MaxIdleConns: 100, // Golang default is 100 MaxConnsPerHost: 1000, - 
IdleConnTimeout: time.Duration(crawler.Timeout) * time.Second, - TLSHandshakeTimeout: time.Duration(crawler.Timeout) * time.Second, + IdleConnTimeout: time.Duration(cfg.Timeout) * time.Second, + TLSHandshakeTimeout: time.Duration(cfg.Timeout) * time.Second, TLSClientConfig: &tls.Config{ InsecureSkipVerify: true, Renegotiation: tls.RenegotiateOnceAsClient, @@ -188,17 +284,23 @@ func New(options *Options) (crawler *Crawler, err error) { var parsedLocation *hqgourl.URL - parsedLocation, err = hqgourl.Parse(nextLocation) + parsedLocation, err = up.Parse(nextLocation) if err != nil { - return err + return + } + + if parsedLocation.Domain == nil { + return } - if crawler.IncludeSubdomains && (parsedLocation.Domain == crawler.Domain || strings.HasSuffix(parsedLocation.Domain, "."+crawler.Domain)) { - return nil + fmt.Println(parsedLocation) + + if cfg.IncludeSubdomains && (parsedLocation.Domain.String() == cfg.Domain || strings.HasSuffix(parsedLocation.Domain.String(), "."+cfg.Domain)) { + return } - if parsedLocation.Domain == crawler.Domain || parsedLocation.Domain == "www."+crawler.Domain { - return nil + if parsedLocation.Domain.String() == cfg.Domain || parsedLocation.Domain.String() == "www."+cfg.Domain { + return } return http.ErrUseLastResponse @@ -211,9 +313,7 @@ func New(options *Options) (crawler *Crawler, err error) { // Proxies // NOTE: Must come AFTER .SetClient calls if len(crawler.Proxies) > 0 { - var ( - rrps colly.ProxyFunc - ) + var rrps colly.ProxyFunc rrps, err = proxy.RoundRobinProxySwitcher(crawler.Proxies...) 
if err != nil { @@ -231,94 +331,3 @@ func New(options *Options) (crawler *Crawler, err error) { return } - -func (crawler *Crawler) Crawl() (URLsChannel chan URL) { - URLsChannel = make(chan URL) - - go func() { - defer close(URLsChannel) - - seedsChannel := make(chan string, crawler.Parallelism) - - go func() { - defer close(seedsChannel) - - for index := range crawler.Seeds { - seed := crawler.Seeds[index] - - seedsChannel <- seed - } - }() - - URLsWG := new(sync.WaitGroup) - - for i := 0; i < crawler.Parallelism; i++ { - URLsWG.Add(1) - - go func() { - defer URLsWG.Done() - - for seed := range seedsChannel { - parsedSeed, err := hqgourl.Parse(seed) - if err != nil { - continue - } - - wg := &sync.WaitGroup{} - seen := &sync.Map{} - - wg.Add(1) - - go func() { - defer wg.Done() - - for URL := range crawler.sitemapParsing(parsedSeed) { - _, loaded := seen.LoadOrStore(URL.Value, struct{}{}) - if loaded { - continue - } - - URLsChannel <- URL - } - }() - - wg.Add(1) - - go func() { - defer wg.Done() - - for URL := range crawler.robotsParsing(parsedSeed) { - _, loaded := seen.LoadOrStore(URL, struct{}{}) - if loaded { - continue - } - - URLsChannel <- URL - } - }() - - wg.Add(1) - - go func() { - defer wg.Done() - - for URL := range crawler.pageCrawl(parsedSeed) { - _, loaded := seen.LoadOrStore(URL, struct{}{}) - if loaded { - continue - } - - URLsChannel <- URL - } - }() - - wg.Wait() - } - }() - } - - URLsWG.Wait() - }() - - return -} From e36df1bdd854d0700ea8b2852faea121a7391825 Mon Sep 17 00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Sun, 20 Oct 2024 18:14:24 +0300 Subject: [PATCH 08/13] chore(*): - --- README.md | 4 ++-- go.mod | 12 ++++++------ go.sum | 24 ++++++++++++------------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 00a90ca..faf3af4 100644 --- a/README.md +++ b/README.md @@ -102,8 +102,8 @@ go install -v github.com/hueristiq/xcrawl3r/cmd/xcrawl3r@latest 
```bash sudo mv xcrawl3r /usr/local/bin/ ``` - > [!NOTE] - > Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. + + Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xcrawl3r` to their `PATH`. > [!CAUTION] diff --git a/go.mod b/go.mod index 448675c..5771bd1 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,10 @@ module github.com/hueristiq/xcrawl3r go 1.23.1 require ( - github.com/chromedp/chromedp v0.10.0 + github.com/chromedp/chromedp v0.11.0 github.com/gocolly/colly/v2 v2.1.0 - github.com/hueristiq/hq-go-http v0.0.0-20241013111525-65cf25fd7246 - github.com/hueristiq/hq-go-url v0.0.0-20241006190408-b9120b7b7d91 + github.com/hueristiq/hq-go-http v0.0.0-20241020113552-532feebd5687 + github.com/hueristiq/hq-go-url v0.0.0-20241020144539-a9e1f60005ea github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f github.com/logrusorgru/aurora/v3 v3.0.0 github.com/spf13/pflag v1.0.5 @@ -19,15 +19,15 @@ require ( github.com/antchfx/htmlquery v1.3.3 // indirect github.com/antchfx/xmlquery v1.4.2 // indirect github.com/antchfx/xpath v1.3.2 // indirect - github.com/chromedp/cdproto v0.0.0-20241003230502-a4a8f7c660df // indirect - github.com/chromedp/sysutil v1.0.0 // indirect + github.com/chromedp/cdproto v0.0.0-20241014181340-cb3a7a1d51d7 // indirect + github.com/chromedp/sysutil v1.1.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.4 // indirect - github.com/hueristiq/hq-go-retrier v0.0.0-20241006185156-b525dc69639c // indirect + 
github.com/hueristiq/hq-go-retrier v0.0.0-20241020110813-ef8a550b01d5 // indirect github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect diff --git a/go.sum b/go.sum index 264abd6..0839daa 100644 --- a/go.sum +++ b/go.sum @@ -20,13 +20,14 @@ github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNY github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U= github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/cdproto v0.0.0-20241003230502-a4a8f7c660df h1:cbtSn19AtqQha1cxmP2Qvgd3fFMz51AeAEKLJMyEUhc= github.com/chromedp/cdproto v0.0.0-20241003230502-a4a8f7c660df/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= -github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E= -github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE= -github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= +github.com/chromedp/cdproto v0.0.0-20241014181340-cb3a7a1d51d7 h1:VDBgUGgdCBw9lTKwp0KPExhnqmGfGVJQTER2MehoICk= +github.com/chromedp/cdproto v0.0.0-20241014181340-cb3a7a1d51d7/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/chromedp v0.11.0 h1:1PT6O4g39sBAFjlljIHTpxmCSk8meeYL6+R+oXH4bWA= +github.com/chromedp/chromedp v0.11.0/go.mod h1:jsD7OHrX0Qmskqb5Y4fn4jHnqquqW22rkMFgKbECsqg= github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= +github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM= +github.com/chromedp/sysutil v1.1.0/go.mod 
h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -69,12 +70,12 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/hueristiq/hq-go-http v0.0.0-20241013111525-65cf25fd7246 h1:7vtPVtFXzDdeHxQyNpn+GL/0389EtOD+M/Nxf4ByVkA= -github.com/hueristiq/hq-go-http v0.0.0-20241013111525-65cf25fd7246/go.mod h1:HNq1LSRIFndL7YzB17UVatJxsbWQFZkaaJvJv5AYWX0= -github.com/hueristiq/hq-go-retrier v0.0.0-20241006185156-b525dc69639c h1:5QeToyM/D2hQzXdlQ+PiF24xbNcHETMLF1ZRR+rYDYI= -github.com/hueristiq/hq-go-retrier v0.0.0-20241006185156-b525dc69639c/go.mod h1:YkxIHoJHsL0wmzQ3tc0qz4UTr9q9eCicUt5RvMV//xw= -github.com/hueristiq/hq-go-url v0.0.0-20241006190408-b9120b7b7d91 h1:CIHiYD3YJFlCDqZlXxD/zH6xnMcF3/XC7yGeR3DdLso= -github.com/hueristiq/hq-go-url v0.0.0-20241006190408-b9120b7b7d91/go.mod h1:oDQP/s9eYGtsTz7LrbCC9CKbq6TmfB9b7yAoPCIqqyQ= +github.com/hueristiq/hq-go-http v0.0.0-20241020113552-532feebd5687 h1:wbtQCCbsyYpI22jE6f7MH979yNpvMPy0vertuYq32p0= +github.com/hueristiq/hq-go-http v0.0.0-20241020113552-532feebd5687/go.mod h1:4cIeUJTM4gt2NgJ4jOZePenVWaY8337wB1pvsK6sYDs= +github.com/hueristiq/hq-go-retrier v0.0.0-20241020110813-ef8a550b01d5 h1:uSIqfeqkXZI/QciepvLVduqbU7Rq+jr+At0ENVjPIN4= +github.com/hueristiq/hq-go-retrier v0.0.0-20241020110813-ef8a550b01d5/go.mod h1:YkxIHoJHsL0wmzQ3tc0qz4UTr9q9eCicUt5RvMV//xw= +github.com/hueristiq/hq-go-url v0.0.0-20241020144539-a9e1f60005ea h1:aFUvZ+Bnae4Coo97oThYy6OmuIwEkrQNGesMzAidedc= 
+github.com/hueristiq/hq-go-url v0.0.0-20241020144539-a9e1f60005ea/go.mod h1:1q7KVF3MOodsQzUkWwDwqn62L0Yjj8nLDSqZF0oirgQ= github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f h1:JAgZOIJ+UbkENpRiOTlfg51CW0UNrUkgwLjUGiH+x9g= github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f/go.mod h1:S5J3E3Azva5+JKv67uc+Hh3XwLDvkVYDGjEaMTFrIqg= github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 h1:dpHAa9c74HgAXkZ2WPd84q2cCiF76eluuSGRw7bk7To= @@ -155,7 +156,6 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= From a18e9f352ee25082554d2a852093bd97e77dcae2 Mon Sep 17 00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Sun, 20 Oct 2024 18:16:31 +0300 Subject: [PATCH 09/13] chore(*): - --- Makefile | 2 +- internal/configuration/configuration.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 3d009d8..6413300 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ SHELL = /bin/sh # Define the project name for easy reference. -PROJECT = "xcrawl3rr" +PROJECT = "xcrawl3r" # The default target that gets executed when the `make` command is run without arguments. # In this case, it will trigger the `go-build` target. 
diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go index bd51ef5..63c60d8 100644 --- a/internal/configuration/configuration.go +++ b/internal/configuration/configuration.go @@ -3,8 +3,8 @@ package configuration import "github.com/logrusorgru/aurora/v3" const ( - NAME string = "xcrawl3r" - VERSION string = "0.1.0" + NAME = "xcrawl3r" + VERSION = "0.1.0" ) var BANNER = aurora.Sprintf( From 8879e1d002021e02e1274b871002aeeeac130ef3 Mon Sep 17 00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Sun, 20 Oct 2024 18:25:59 +0300 Subject: [PATCH 10/13] docs(README): - --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index faf3af4..c0b66de 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# xcrawl3r +# X Crawler (`xcrawl3r`) ![made with go](https://img.shields.io/badge/made%20with-Go-1E90FF.svg) [![go report card](https://goreportcard.com/badge/github.com/hueristiq/xcrawl3r)](https://goreportcard.com/report/github.com/hueristiq/xcrawl3r) [![release](https://img.shields.io/github/release/hueristiq/xcrawl3r?style=flat&color=1E90FF)](https://github.com/hueristiq/xcrawl3r/releases) [![open issues](https://img.shields.io/github/issues-raw/hueristiq/xcrawl3r.svg?style=flat&color=1E90FF)](https://github.com/hueristiq/xcrawl3r/issues?q=is:issue+is:open) [![closed issues](https://img.shields.io/github/issues-closed-raw/hueristiq/xcrawl3r.svg?style=flat&color=1E90FF)](https://github.com/hueristiq/xcrawl3r/issues?q=is:issue+is:closed) [![license](https://img.shields.io/badge/license-MIT-gray.svg?color=1E90FF)](https://github.com/hueristiq/xcrawl3r/blob/master/LICENSE) ![maintenance](https://img.shields.io/badge/maintained%3F-yes-1E90FF.svg) [![contribution](https://img.shields.io/badge/contributions-welcome-1E90FF.svg)](https://github.com/hueristiq/xcrawl3r/blob/master/CONTRIBUTING.md) From c4b0adfbbf97dca04d63e22a970241256dcd6b35 Mon Sep 17 
00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Sun, 20 Oct 2024 21:03:23 +0300 Subject: [PATCH 11/13] ci(*): - --- .github/workflows/dockerhub-push.yaml | 2 +- .github/workflows/lint.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dockerhub-push.yaml b/.github/workflows/dockerhub-push.yaml index b8da853..9ffe4ed 100644 --- a/.github/workflows/dockerhub-push.yaml +++ b/.github/workflows/dockerhub-push.yaml @@ -1,4 +1,4 @@ -name: 🐋 DockerHub Push +name: 🐳 DockerHub Push on: workflow_run: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7904b01..34d36e6 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,4 +1,4 @@ -name: 💅🏻 Lint +name: 💅 Lint on: push: From 02d781c49d24ed1270dea33fa4b235379431b864 Mon Sep 17 00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Sun, 20 Oct 2024 23:29:53 +0300 Subject: [PATCH 12/13] docs(README): - --- README.md | 65 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index c0b66de..c72eac0 100644 --- a/README.md +++ b/README.md @@ -136,50 +136,51 @@ xcrawl3r -h Here's what the help message looks like: ```text - _ _____ -__ _____ _ __ __ ___ _| |___ / _ __ -\ \/ / __| '__/ _` \ \ /\ / / | |_ \| '__| - > < (__| | | (_| |\ V V /| |___) | | -/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| v0.1.0 -A CLI utility to recursively crawl webpages. 
+ _ _____ +__ _____ _ __ __ ___ _| |___ / _ __ +\ \/ / __| '__/ _` \ \ /\ / / | |_ \| '__| + > < (__| | | (_| |\ V V /| |___) | | +/_/\_\___|_| \__,_| \_/\_/ |_|____/|_| + v0.1.0 USAGE: xcrawl3r [OPTIONS] INPUT: - -d, --domain string domain to match URLs - --include-subdomains bool match subdomains' URLs - -s, --seeds string seed URLs file (use `-` to get from stdin) - -u, --url string URL to crawl + -d, --domain string domain to match URLs + --include-subdomains bool match subdomains' URLs + -s, --seeds string seed URLs file (use `-` to get from stdin) + -u, --url string URL to crawl CONFIGURATION: - --depth int maximum depth to crawl (default 3) - TIP: set it to `0` for infinite recursion - --headless bool If true the browser will be displayed while crawling. - -H, --headers string[] custom header to include in requests - e.g. -H 'Referer: http://example.com/' - TIP: use multiple flag to set multiple headers - --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080) - TIP: use multiple flag to set multiple proxies - --render bool utilize a headless chrome instance to render pages - --timeout int time to wait for request in seconds (default: 10) - --user-agent string User Agent to use (default: web) - TIP: use `web` for a random web user-agent, - `mobile` for a random mobile user-agent, - or you can set your specific user-agent. + --depth int maximum depth to crawl (default 3) + TIP: set it to `0` for infinite recursion + --headless bool If true the browser will be displayed while crawling. + -H, --headers string[] custom header to include in requests + e.g. 
-H 'Referer: http://example.com/' + TIP: use multiple flag to set multiple headers + --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080) + TIP: use multiple flag to set multiple proxies + --render bool utilize a headless chrome instance to render pages + --timeout int time to wait for request in seconds (default: 10) + --user-agent string User Agent to use (default: xcrawl3r v0.1.0 (https://github.com/hueristiq/xcrawl3r)) + TIP: use `web` for a random web user-agent, + `mobile` for a random mobile user-agent, + or you can set your specific user-agent. RATE LIMIT: - -c, --concurrency int number of concurrent fetchers to use (default 10) - --delay int delay between each request in seconds - --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s) - -p, --parallelism int number of concurrent URLs to process (default: 10) + -c, --concurrency int number of concurrent fetchers to use (default 10) + --delay int delay between each request in seconds + --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s) + -p, --parallelism int number of concurrent URLs to process (default: 10) OUTPUT: - --debug bool enable debug mode (default: false) - -m, --monochrome bool coloring: no colored output mode - -o, --output string output file to write found URLs - -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug) + --debug bool enable debug mode (default: false) + -m, --monochrome bool coloring: no colored output mode + -o, --output string output file to write found URLs + --silent bool display output URLs only + -v, --verbose bool display verbose output ``` From 752023c1c10bea35092114fe365127a692330e23 Mon Sep 17 00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Sun, 20 Oct 2024 23:31:17 +0300 Subject: [PATCH 13/13] chore(*): Bump up version to 0.2.0 --- README.md | 4 ++-- internal/configuration/configuration.go | 2 +- 2 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c72eac0..d0b3c68 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ __ _____ _ __ __ ___ _| |___ / _ __ \ \/ / __| '__/ _` \ \ /\ / / | |_ \| '__| > < (__| | | (_| |\ V V /| |___) | | /_/\_\___|_| \__,_| \_/\_/ |_|____/|_| - v0.1.0 + v0.2.0 USAGE: xcrawl3r [OPTIONS] @@ -164,7 +164,7 @@ CONFIGURATION: TIP: use multiple flag to set multiple proxies --render bool utilize a headless chrome instance to render pages --timeout int time to wait for request in seconds (default: 10) - --user-agent string User Agent to use (default: xcrawl3r v0.1.0 (https://github.com/hueristiq/xcrawl3r)) + --user-agent string User Agent to use (default: xcrawl3r v0.2.0 (https://github.com/hueristiq/xcrawl3r)) TIP: use `web` for a random web user-agent, `mobile` for a random mobile user-agent, or you can set your specific user-agent. diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go index 63c60d8..ada4d5d 100644 --- a/internal/configuration/configuration.go +++ b/internal/configuration/configuration.go @@ -4,7 +4,7 @@ import "github.com/logrusorgru/aurora/v3" const ( NAME = "xcrawl3r" - VERSION = "0.1.0" + VERSION = "0.2.0" ) var BANNER = aurora.Sprintf(