From d040e7310bcaf962e23c64969ffe95c7582adb00 Mon Sep 17 00:00:00 2001 From: Stefan Strigler Date: Wed, 8 Jan 2025 16:47:06 +0100 Subject: [PATCH] mod_spam_filter: resolve http redirects so we're not fooled by url shorteners --- mod_spam_filter/src/mod_spam_filter.erl | 43 ++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/mod_spam_filter/src/mod_spam_filter.erl b/mod_spam_filter/src/mod_spam_filter.erl index 1542c219..54e850f7 100644 --- a/mod_spam_filter/src/mod_spam_filter.erl +++ b/mod_spam_filter/src/mod_spam_filter.erl @@ -62,6 +62,7 @@ -include_lib("xmpp/include/xmpp.hrl"). -define(COMMAND_TIMEOUT, timer:seconds(30)). +-define(HTTPC_TIMEOUT, timer:seconds(3)). -type url() :: binary(). -type filename() :: binary() | none. @@ -197,6 +198,9 @@ handle_call({check_body, URLs, JIDs, From}, _From, Result2 end, {reply, {spam_filter, Result}, State2}; +handle_call({resolve_redirects, URLs}, _From, State) -> + ResolvedURLs = do_resolve_redirects(URLs, []), + {reply, {spam_filter, ResolvedURLs}, State}; handle_call({reload_files, JIDsFile, URLsFile}, _From, State) -> {Result, State1} = reload_files(JIDsFile, URLsFile, State), {reply, {spam_filter, Result}, State1}; @@ -390,7 +394,7 @@ check_from(Host, From) -> -spec check_body(binary(), jid(), binary()) -> ham | spam. check_body(Host, From, Body) -> - case {extract_urls(Body), extract_jids(Body)} of + case {extract_urls(Host, Body), extract_jids(Body)} of {none, none} -> ?DEBUG("No JIDs/URLs found in message", []), ham; @@ -406,17 +410,48 @@ check_body(Host, From, Body) -> end end. --spec extract_urls(binary()) -> {urls, [url()]} | none. -extract_urls(Body) -> +-spec extract_urls(binary(), binary()) -> {urls, [url()]} | none. +extract_urls(Host, Body) -> RE = <<"https?://\\S+">>, Options = [global, {capture, all, binary}], case re:run(Body, RE, Options) of {match, Captured} when is_list(Captured) -> - {urls, lists:flatten(Captured)}; + Urls = resolve_redirects(Host, lists:flatten(Captured)), + {urls, Urls}; nomatch -> none end. +-spec resolve_redirects(binary(), [url()]) -> [url()]. +resolve_redirects(Host, URLs) -> + Proc = get_proc_name(Host), + try gen_server:call(Proc, {resolve_redirects, URLs}) of + {spam_filter, ResolvedURLs} -> + ResolvedURLs + catch exit:{timeout, _} -> + ?WARNING_MSG("Timeout while resolving redirect: ~p", [URLs]), + URLs + end. + +-spec do_resolve_redirects([url()], [url()]) -> [url()]. +do_resolve_redirects([], Result) -> Result; +do_resolve_redirects([URL | Rest], Acc) -> + case + httpc:request(get, {URL, [{"user-agent", "curl/8.7.1"}]}, + [{autoredirect, false}, {timeout, ?HTTPC_TIMEOUT}], []) + of + {ok, {{_, Moved, _}, Headers, _Body}} when Moved >= 300, Moved < 400 -> + Location = proplists:get_value("location", Headers), + case lists:member(Location, Acc) of + false -> + do_resolve_redirects([Location | Rest], [URL | Acc]); + true -> + do_resolve_redirects(Rest, [URL | Acc]) + end; + _Res -> + do_resolve_redirects(Rest, [URL | Acc]) + end. + -spec extract_jids(binary()) -> {jids, [ljid()]} | none. extract_jids(Body) -> RE = <<"\\S+@\\S+">>,