diff --git a/Cargo.lock b/Cargo.lock index c7338a2..9cdb2eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -549,6 +549,7 @@ dependencies = [ "iana-time-zone", "js-sys", "num-traits", + "serde", "wasm-bindgen", "windows-targets 0.52.6", ] @@ -678,6 +679,16 @@ dependencies = [ "version_check", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.6" @@ -914,6 +925,9 @@ dependencies = [ "fastside-core", "fastside-shared", "getrandom", + "reqwest", + "serde", + "serde_json", "tokio", "tower-service", "wasm-bindgen-futures", @@ -972,6 +986,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1372,6 +1401,22 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.6" @@ -1474,7 +1519,7 @@ dependencies = [ "socket2 0.5.7", "widestring", "windows-sys 0.48.0", - "winreg 0.50.0", + "winreg", ] [[package]] @@ -1677,6 +1722,23 @@ dependencies = [ "version_check", ] +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nom" version = "7.1.3" @@ -1727,6 +1789,50 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl" +version = "0.10.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "ordered-multimap" version = "0.6.0" @@ -1866,6 +1972,12 @@ dependencies = [ "futures-io", ] +[[package]] +name = "pkg-config" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" + [[package]] name = "polling" version = "2.8.0" @@ -2062,12 +2174,13 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2", @@ -2076,11 +2189,13 @@ dependencies = [ "http-body-util", "hyper", "hyper-rustls", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -2092,7 +2207,9 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper 1.0.1", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls", "tokio-socks", "tower-service", @@ -2101,7 +2218,7 @@ dependencies = [ "wasm-bindgen-futures", "web-sys", "webpki-roots", - "winreg 0.52.0", + "windows-registry", ] [[package]] @@ -2249,17 +2366,49 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "schannel" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "serde" -version = "1.0.204" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] @@ -2288,9 +2437,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.204" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -2299,9 +2448,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.122" +version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "itoa", "memchr", @@ -2439,9 +2588,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.72" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -2459,6 +2608,43 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] + +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" +dependencies = [ + "cfg-if", + "fastrand 2.1.0", + "once_cell", + "rustix 0.38.34", + "windows-sys 0.59.0", +] [[package]] name = "termcolor" @@ -2573,6 +2759,16 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.0" @@ -2811,6 +3007,12 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -2983,6 +3185,36 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -3001,6 +3233,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -3141,16 +3382,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "worker" version = "0.4.2" diff --git a/fastside-cloudflare/Cargo.toml b/fastside-cloudflare/Cargo.toml index 06133b0..8f95b70 100644 --- a/fastside-cloudflare/Cargo.toml +++ b/fastside-cloudflare/Cargo.toml @@ -45,6 +45,9 @@ tower-service = "0.3.3" console_error_panic_hook = "0.1.7" getrandom = { version = "0.2.15", features = ["js"] } tokio = { version = "1.41.1", default-features = false, features = ["sync"] } +serde = "1.0.215" +serde_json = "1.0.132" +reqwest = "0.12.9" # [features] # default = [] diff --git a/fastside-cloudflare/src/lib.rs b/fastside-cloudflare/src/lib.rs index 4c4a92e..d90e112 100644 --- a/fastside-cloudflare/src/lib.rs +++ b/fastside-cloudflare/src/lib.rs @@ -2,27 +2,57 @@ use std::{collections::HashMap, sync::Arc}; use axum::Router; use fastside_core::{ - crawler::Crawler, + crawler::{CrawledData, Crawler}, routes::main_router, types::{AppState, LoadedData}, }; -use fastside_shared::config::{AppConfig, UserConfig}; +use fastside_shared::{ + config::AppConfig, + serde_types::{ServicesData, StoredData}, +}; use tokio::sync::RwLock; use tower_service::Service; use worker::*; -fn router() -> Router { - let config = Arc::new(AppConfig::default()); - let loaded_data = Arc::new(RwLock::new(LoadedData { - services: HashMap::new(), - proxies: HashMap::new(), - default_user_config: UserConfig::default(), - })); +fn load_config(env: &Env) -> AppConfig { + let config_str = env + .var("config") + .expect("config variable is not set") + .to_string(); + let config: AppConfig = serde_json::from_str(&config_str).expect("failed to parse config"); + config +} + +async fn router(env: &Env) -> Router { + let config = Arc::new(load_config(&env)); + let loaded_data: Arc> = Arc::new(RwLock::new( + serde_json::from_str( + &env.kv("fastside") + .expect("failed to get kv") + .get("loaded_data") + .text() + .await + .expect("failed to get loaded_data from kv") + .expect("loaded_data not found"), + ) + .expect("failed to parse loaded_data"), + )); + let crawled_data: CrawledData = serde_json::from_str( + &env.kv("fastside") + .expect("failed to get kv") + .get("crawled_data") + .text() + .await + .expect("failed to get crawled_data from kv") + .expect("crawled_data not found"), + ) + .expect("failed to parse data"); let shared_state = Arc::new(AppState { config: config.clone(), - crawler: Arc::new(Crawler::new( + crawler: Arc::new(Crawler::with_data( loaded_data.clone(), config.clone().crawler.clone(), + crawled_data, )), loaded_data: loaded_data.clone(), regexes: HashMap::new(), @@ -35,10 +65,66 @@ fn router() -> Router { #[event(fetch)] async fn fetch( req: HttpRequest, - _env: Env, + env: Env, _ctx: Context, ) -> Result> { console_error_panic_hook::set_once(); - Ok(router().call(req).await?) + Ok(router(&env).await.call(req).await?) +} + +#[event(scheduled)] +async fn scheduled(_event: ScheduledEvent, env: Env, _ctx: ScheduleContext) { + console_error_panic_hook::set_once(); + + let config = load_config(&env); + let services_url = env + .var("services_url") + .expect("services_url variable is not set") + .to_string(); + + let services_str = reqwest::get(services_url) + .await + .expect("request to services failed") + .text() + .await + .expect("failed to get services text"); + let stored_data: StoredData = + serde_json::from_str(&services_str).expect("failed to parse services"); + + let loaded_data = { + let services_data: ServicesData = stored_data + .services + .into_iter() + .map(|service| (service.name.clone(), service)) + .collect(); + let loaded_data = LoadedData { + services: services_data, + proxies: config.proxies.clone(), + default_user_config: config.default_user_config.clone(), + }; + env.kv("fastside") + .expect("failed to get kv") + .put( + "loaded_data", + serde_json::to_string(&loaded_data).expect("failed to serialize loaded_data"), + ) + .expect("failed to put loaded_data to kv (builder)") + .execute() + .await + .expect("failed to put loaded_data to kv (request)"); + Arc::new(RwLock::new(loaded_data)) + }; + + let crawler = Crawler::new(loaded_data, config.crawler.clone()); + crawler.crawl(None).await.expect("failed to crawl"); + + let data_str = serde_json::to_string(&*crawler.read().await).expect("failed to serialize data"); + env.kv("fastside") + .expect("failed to get kv") + .put("crawled_data", data_str) + .expect("failed to put crawled_data to kv (builder)") + .execute() + .await + .expect("failed to put crawled_data to kv (request)"); } diff --git a/fastside-cloudflare/wrangler.toml b/fastside-cloudflare/wrangler.toml index 77158b5..bdf6dcf 100644 --- a/fastside-cloudflare/wrangler.toml +++ b/fastside-cloudflare/wrangler.toml @@ -2,5 +2,16 @@ name = "fastside" main = "build/worker/shim.mjs" compatibility_date = "2023-03-22" +kv_namespaces = [ + { binding = "fastside", id = "989f6480799a4d7c8e2890ac01118fc7" } +] + [build] command = "cargo install worker-build --version 0.1.0 && worker-build --release" + +[vars] +services_url = "https://raw.githubusercontent.com/cofob/fastside/refs/heads/master/services.json" +config = '{"crawler":{"request_timeout":{"secs":1,"nanos":0},"max_concurrent_requests":200},"default_user_config":{"required_tags":["clearnet","https","ipv4"]}}' + +[observability] +enabled = true diff --git a/fastside-core/Cargo.toml b/fastside-core/Cargo.toml index e268b51..a471cdf 100644 --- a/fastside-core/Cargo.toml +++ b/fastside-core/Cargo.toml @@ -28,7 +28,7 @@ thiserror = "1.0.60" # tokio = { version = "1.37.0", default-features = false, features = ["sync"] } futures = "0.3.30" # async rand = "0.8.5" # random -chrono = "0.4.38" # datetime +chrono = { version = "0.4.38", features = ["serde"] } time = "0.3.36" # time offsets regex = "1.10.5" # regex base64 = "0.22.1" # base64 diff --git a/fastside-core/src/crawler.rs b/fastside-core/src/crawler.rs index c6e3c76..e833fbb 100644 --- a/fastside-core/src/crawler.rs +++ b/fastside-core/src/crawler.rs @@ -5,23 +5,38 @@ use std::{ }; use chrono::{DateTime, Utc}; -use reqwest::StatusCode; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::{ sync::{Mutex, MutexGuard, RwLock}, - time::sleep, + time::{sleep, timeout}, }; use url::Url; use fastside_shared::config::CrawlerConfig; +#[cfg(not(target_arch = "wasm32"))] +use fastside_shared::parallel::Parallelise; use fastside_shared::{ client_builder::build_client, - parallel::Parallelise, serde_types::{HttpCodeRanges, Instance, Service}, }; use crate::types::LoadedData; +fn utc_now() -> DateTime { + #[cfg(not(target_arch = "wasm32"))] + return Utc::now(); + #[cfg(target_arch = "wasm32")] + return DateTime::from_timestamp(0, 0).unwrap(); +} + +fn system_now() -> SystemTime { + #[cfg(not(target_arch = "wasm32"))] + return SystemTime::now(); + #[cfg(target_arch = "wasm32")] + return UNIX_EPOCH; +} + #[derive(Error, Debug)] pub enum CrawlerError { #[error("url error: `{0}`")] @@ -30,11 +45,11 @@ pub enum CrawlerError { RequestError(#[from] reqwest::Error), } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub enum CrawledInstanceStatus { Ok(Duration), #[allow(dead_code)] - InvalidStatusCode(StatusCode, Duration), + InvalidStatusCode(String, Duration), StringNotFound, ConnectionError, RedirectPolicyError, @@ -62,14 +77,14 @@ impl std::fmt::Display for CrawledInstanceStatus { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct CrawledInstance { pub url: Url, pub status: CrawledInstanceStatus, pub tags: Vec, } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct CrawledService { pub name: String, pub instances: Vec, @@ -83,13 +98,13 @@ impl CrawledService { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct CrawledServices { pub services: HashMap, pub time: DateTime, } -#[derive(Debug)] +#[derive(Debug, Deserialize, Serialize)] pub enum CrawledData { CrawledServices(CrawledServices), InitialLoading, @@ -141,7 +156,23 @@ impl Crawler { Self { loaded_data, config: Arc::new(config), - data: RwLock::new(CrawledData::InitialLoading), + data: RwLock::new(CrawledData::CrawledServices(CrawledServices { + services: HashMap::new(), + time: utc_now(), + })), + crawler_lock: Mutex::new(()), + } + } + + pub fn with_data( + loaded_data: Arc>, + config: CrawlerConfig, + data: CrawledData, + ) -> Self { + Self { + loaded_data, + config: Arc::new(config), + data: RwLock::new(data), crawler_lock: Mutex::new(()), } } @@ -165,11 +196,11 @@ impl Crawler { )?; let test_url = instance.url.join(&service.test_url)?; - let start = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); + let start = system_now().duration_since(UNIX_EPOCH).unwrap(); let response = client.get(test_url).send().await; let status = match response { Ok(response) => { - let end = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); + let end = system_now().duration_since(UNIX_EPOCH).unwrap(); let status_code = response.status().as_u16(); if service.allowed_http_codes.is_allowed(status_code) { if let Some(search_string) = &service.search_string { @@ -183,7 +214,10 @@ impl Crawler { CrawledInstanceStatus::Ok(end - start) } } else { - CrawledInstanceStatus::InvalidStatusCode(response.status(), end - start) + CrawledInstanceStatus::InvalidStatusCode( + response.status().to_string(), + end - start, + ) } } Err(e) => match e { @@ -211,7 +245,7 @@ impl Crawler { Ok(ret) } - async fn crawl<'a>( + pub async fn crawl<'a>( &self, crawler_guard: Option>, ) -> Result<(), CrawlerError> { @@ -242,7 +276,63 @@ impl Crawler { ) }) .collect(); - let mut parallelise = Parallelise::with_capacity(self.config.max_concurrent_requests); + + #[cfg(not(target_arch = "wasm32"))] + let mut results = { + let mut parallelise = Parallelise::with_capacity(self.config.max_concurrent_requests); + + for service in self.loaded_data.read().await.services.values() { + let service = Arc::new(service.clone()); + for instance in &service.instances { + let loaded_data = self.loaded_data.clone(); + let config = self.config.clone(); + let instance = instance.clone(); + parallelise + .push(tokio::spawn(Self::crawl_single_instance( + config, + loaded_data.clone(), + service.clone(), + instance, + ))) + .await; + } + } + + parallelise.wait().await + }; + + #[cfg(target_arch = "wasm32")] + let mut results = { + let mut out = Vec::new(); + + for service in self.loaded_data.read().await.services.values() { + let service = Arc::new(service.clone()); + for instance in &service.instances { + let loaded_data = self.loaded_data.clone(); + let config = self.config.clone(); + let instance = instance.clone(); + let inst = match timeout( + Duration::from_secs(1), + Self::crawl_single_instance( + config, + loaded_data.clone(), + service.clone(), + instance, + ), + ) + .await + { + Ok(r) => r, + Err(_) => { + continue; + } + }; + out.push(inst); + } + } + + out + }; for service in self.loaded_data.read().await.services.values() { let service = Arc::new(service.clone()); @@ -250,19 +340,18 @@ impl Crawler { let loaded_data = self.loaded_data.clone(); let config = self.config.clone(); let instance = instance.clone(); - parallelise - .push(tokio::spawn(Self::crawl_single_instance( + results.push( + Self::crawl_single_instance( config, loaded_data.clone(), service.clone(), instance, - ))) - .await; + ) + .await, + ); } } - let results = parallelise.wait().await; - for result in results { let (crawled_instance, name) = match result { Ok(c) => c, @@ -281,7 +370,7 @@ impl Crawler { let mut data = self.data.write().await; data.replace(CrawledData::CrawledServices(CrawledServices { services: crawled_services, - time: Utc::now(), + time: utc_now(), })); match data.as_ref() { diff --git a/fastside-core/src/lib.rs b/fastside-core/src/lib.rs index 96c1528..008fbfb 100644 --- a/fastside-core/src/lib.rs +++ b/fastside-core/src/lib.rs @@ -1,9 +1,4 @@ -#[cfg(not(target_arch = "wasm32"))] pub mod crawler; -#[cfg(target_arch = "wasm32")] -pub mod stub_crawler; -#[cfg(target_arch = "wasm32")] -pub use stub_crawler as crawler; pub mod errors; pub mod filters; pub mod routes; diff --git a/fastside-core/src/routes/index.rs b/fastside-core/src/routes/index.rs index 0c217de..18a186a 100644 --- a/fastside-core/src/routes/index.rs +++ b/fastside-core/src/routes/index.rs @@ -1,5 +1,6 @@ use askama::Template; use axum::{ + body::Bytes, extract::State, http::{header::CONTENT_TYPE, StatusCode}, response::{Html, IntoResponse}, @@ -14,7 +15,10 @@ use crate::{filters, types::AppState}; use fastside_shared::serde_types::ServicesData; pub fn router() -> Router> { - Router::new().route("/", get(index)) + Router::new() + .route("/", get(index)) + .route("/favicon.ico", get(favicon)) + .route("/robots.txt", get(robots_txt)) } /// The `IndexTemplate` structure renders the index page using the Askama template engine. @@ -55,3 +59,31 @@ pub async fn index(State(state): State>) -> impl IntoResponse { Err(_) => StatusCode::INTERNAL_SERVER_ERROR.into_response(), } } + +// Favicon as a static byte slice +const FAVICON: &[u8] = include_bytes!("../../static/favicon.ico"); + +// Handler for /favicon.ico +async fn favicon() -> impl IntoResponse { + ( + [ + ("Content-Type", "image/x-icon"), + ("Cache-Control", "public, max-age=3600"), + ], + Bytes::from_static(FAVICON), + ) +} + +// Robots.txt content as a static string +const ROBOTS_TXT: &str = "User-agent: *\nDisallow: /\n"; + +// Handler for /robots.txt +async fn robots_txt() -> impl IntoResponse { + ( + [ + ("Content-Type", "text/plain"), + ("Cache-Control", "public, max-age=3600"), + ], + ROBOTS_TXT, + ) +} diff --git a/fastside-core/src/stub_crawler.rs b/fastside-core/src/stub_crawler.rs deleted file mode 100644 index f0916b7..0000000 --- a/fastside-core/src/stub_crawler.rs +++ /dev/null @@ -1,296 +0,0 @@ -use std::{ - collections::HashMap, - sync::Arc, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; - -use chrono::{DateTime, Utc}; -use reqwest::StatusCode; -use thiserror::Error; -use tokio::{ - sync::{Mutex, MutexGuard, RwLock}, - time::sleep, -}; -use url::Url; - -use fastside_shared::config::CrawlerConfig; -use fastside_shared::{ - client_builder::build_client, - parallel::Parallelise, - serde_types::{HttpCodeRanges, Instance, Service}, -}; - -use crate::types::LoadedData; - -#[derive(Error, Debug)] -pub enum CrawlerError { - #[error("url error: `{0}`")] - UrlError(#[from] url::ParseError), - #[error("request error: `{0}`")] - RequestError(#[from] reqwest::Error), -} - -#[derive(Clone, Debug)] -pub enum CrawledInstanceStatus { - Ok(Duration), - #[allow(dead_code)] - InvalidStatusCode(StatusCode, Duration), - StringNotFound, - ConnectionError, - RedirectPolicyError, - BuilderError, - RequestError, - BodyError, - DecodeError, - TimedOut, - Unknown, -} - -impl CrawledInstanceStatus { - /// Used for sorting values in index.html template. - pub fn as_isize(&self) -> isize { - match self { - Self::Ok(d) => d.as_millis() as isize, - _ => isize::MAX, - } - } -} - -impl std::fmt::Display for CrawledInstanceStatus { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self) - } -} - -#[derive(Clone, Debug)] -pub struct CrawledInstance { - pub url: Url, - pub status: CrawledInstanceStatus, - pub tags: Vec, -} - -#[derive(Clone, Debug)] -pub struct CrawledService { - pub name: String, - pub instances: Vec, -} - -impl CrawledService { - pub fn get_alive_instances(&self) -> impl Iterator { - self.instances - .iter() - .filter(|s| matches!(&s.status, CrawledInstanceStatus::Ok(_))) - } -} - -#[derive(Clone, Debug)] -pub struct CrawledServices { - pub services: HashMap, - pub time: DateTime, -} - -#[derive(Debug)] -pub enum CrawledData { - CrawledServices(CrawledServices), - InitialLoading, - ReloadingServices(CrawledServices), -} - -impl CrawledData { - pub fn get_services(&self) -> Option<&CrawledServices> { - match self { - Self::CrawledServices(s) => Some(s), - Self::InitialLoading => None, - Self::ReloadingServices(current) => Some(current), - } - } - - pub fn is_reloading(&self) -> bool { - matches!(self, Self::ReloadingServices { .. }) - } - - pub fn replace(&mut self, new: CrawledData) { - *self = new; - } - - pub fn make_reloading(&mut self) { - let current = match self { - Self::CrawledServices(s) => s.clone(), - _ => return, - }; - *self = Self::ReloadingServices(current); - } -} - -impl AsRef for CrawledData { - fn as_ref(&self) -> &CrawledData { - self - } -} - -#[derive(Debug)] -pub struct Crawler { - loaded_data: Arc>, - config: Arc, - data: RwLock, - crawler_lock: Mutex<()>, -} - -impl Crawler { - pub fn new(loaded_data: Arc>, config: CrawlerConfig) -> Self { - Self { - loaded_data, - config: Arc::new(config), - data: RwLock::new(CrawledData::InitialLoading), - crawler_lock: Mutex::new(()), - } - } - - #[inline] - pub async fn read(&self) -> tokio::sync::RwLockReadGuard { - self.data.read().await - } - - async fn crawl_single_instance( - config: Arc, - loaded_data: Arc>, - service: Arc, - instance: Instance, - ) -> Result<(CrawledInstance, String), CrawlerError> { - let client = build_client( - service.as_ref(), - config.as_ref(), - &loaded_data.read().await.proxies, - &instance, - )?; - - let test_url = instance.url.join(&service.test_url)?; - let start = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); - let response = client.get(test_url).send().await; - let status = match response { - Ok(response) => { - let end = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); - let status_code = response.status().as_u16(); - if service.allowed_http_codes.is_allowed(status_code) { - if let Some(search_string) = &service.search_string { - let body = response.text().await?; - if !body.contains(search_string) { - CrawledInstanceStatus::StringNotFound - } else { - CrawledInstanceStatus::Ok(end - start) - } - } else { - CrawledInstanceStatus::Ok(end - start) - } - } else { - CrawledInstanceStatus::InvalidStatusCode(response.status(), end - start) - } - } - Err(e) => match e { - _ if e.is_timeout() => CrawledInstanceStatus::TimedOut, - _ if e.is_builder() => CrawledInstanceStatus::BuilderError, - _ if e.is_redirect() => CrawledInstanceStatus::RedirectPolicyError, - _ if e.is_request() => CrawledInstanceStatus::RequestError, - _ if e.is_body() => CrawledInstanceStatus::BodyError, - _ if e.is_decode() => CrawledInstanceStatus::DecodeError, - #[cfg(not(target_arch = "wasm32"))] - _ if e.is_connect() => CrawledInstanceStatus::ConnectionError, - _ => CrawledInstanceStatus::Unknown, - }, - }; - - let ret = ( - CrawledInstance { - url: instance.url.clone(), - tags: instance.tags.clone(), - status, - }, - service.name.clone(), - ); - debug!("Crawled instance: {ret:?}"); - Ok(ret) - } - - async fn crawl<'a>( - &self, - crawler_guard: Option>, - ) -> Result<(), CrawlerError> { - let crawler_guard = match crawler_guard { - Some(guard) => guard, - None => { - let Ok(crawler_guard) = self.crawler_lock.try_lock() else { - warn!("Crawler lock is already acquired, skipping crawl"); - return Ok(()); - }; - crawler_guard - } - }; - - let mut crawled_services: HashMap = self - .loaded_data - .read() - .await - .services - .keys() - .map(|name| { - ( - name.clone(), - CrawledService { - name: name.clone(), - instances: Vec::new(), - }, - ) - }) - .collect(); - - for service in self.loaded_data.read().await.services.values() { - let service = Arc::new(service.clone()); - for instance in &service.instances { - let loaded_data = self.loaded_data.clone(); - let config = self.config.clone(); - let instance = instance.clone(); - } - } - - let mut data = self.data.write().await; - data.replace(CrawledData::CrawledServices(CrawledServices { - services: crawled_services, - time: Utc::now(), - })); - - match data.as_ref() { - CrawledData::ReloadingServices { .. } => { - info!("Finished reloading services"); - } - CrawledData::InitialLoading => { - info!("Finished initial crawl, we are ready to serve requests"); - } - CrawledData::CrawledServices(_) => { - debug!("Finished crawl"); - } - } - - drop(crawler_guard); - Ok(()) - } - - /// Run crawler instantly in update loaded_data mode. - pub async fn update_crawl(&self) -> Result<(), CrawlerError> { - let crawler_guard = self.crawler_lock.lock().await; - let mut data = self.data.write().await; - data.make_reloading(); - drop(data); - self.crawl(Some(crawler_guard)).await - } - - pub async fn crawler_loop(&self) { - loop { - debug!("Starting crawl"); - if let Err(e) = self.crawl(None).await { - error!("Error occured during crawl loop: {e}"); - }; - debug!("Next crawl will start in {:?}", self.config.ping_interval); - sleep(self.config.ping_interval).await; - } - } -} diff --git a/fastside-core/src/types.rs b/fastside-core/src/types.rs index 1a1a79d..054618a 100644 --- a/fastside-core/src/types.rs +++ b/fastside-core/src/types.rs @@ -4,6 +4,7 @@ use fastside_shared::{ config::{AppConfig, ProxyData, UserConfig}, serde_types::ServicesData, }; +use serde::{Deserialize, Serialize}; use tokio::sync::RwLock; use crate::crawler::Crawler; @@ -15,7 +16,7 @@ pub struct CompiledRegexSearch { pub type Regexes = HashMap>; -#[derive(Debug)] +#[derive(Debug, Deserialize, Serialize)] pub struct LoadedData { pub services: ServicesData, pub proxies: ProxyData,