Skip to content

Commit

Permalink
let's keep logic separate
Browse files Browse the repository at this point in the history
  • Loading branch information
Yucked committed Oct 15, 2023
1 parent e125d68 commit 0f24df2
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 27 deletions.
2 changes: 1 addition & 1 deletion Grimoire.Commons/Grimoire.Commons.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="AngleSharp" Version="1.0.4"/>
<PackageReference Include="AngleSharp" Version="1.0.5" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="7.0.4"/>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="7.0.1"/>
</ItemGroup>
Expand Down
89 changes: 63 additions & 26 deletions Grimoire.Commons/HtmlParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,6 @@ public class HtmlParser(ILogger<HtmlParser> logger,
Configuration.Default.WithDefaultLoader()
);

public async Task<IDocument> ParseAsync(string url) {
var retries = 0;
IDocument document;
do {
var content = await GetContentAsync(url);
await using var stream = await content.ReadAsStreamAsync();
document = await _context.OpenAsync(x => x.Content(stream));
await document.WaitForReadyAsync();
if (document.All.Length == 3) {
retries++;
continue;
}

break;
} while (retries <= configuration.GetValue<int>("Http:Retries"));

return document;
}

public async Task<HttpContent> GetContentAsync(string url) {
try {
var requestMessage = new HttpRequestMessage {
Expand Down Expand Up @@ -69,15 +50,71 @@ public Task<IDocument> ParseHtmlAsync(string html) {

public async Task DownloadAsync(string url, string output) {
try {
var content = await GetContentAsync(url);
var fileName =
(content.Headers.ContentDisposition?.FileNameStar
?? url.Split('/')[^1]).Clean();
var requestMessage = new HttpRequestMessage {
Method = HttpMethod.Get,
RequestUri = new Uri(url),
Headers = {
{
"User-Agent", configuration.GetSection("Http:UserAgents").Get<string[]>().RandomItem()
}
}
};

await Task.Delay(Random.Shared.Next(configuration.GetValue<int>("Http:Delay")));
using var responseMessage = await httpClient.SendAsync(requestMessage);
if (!responseMessage.IsSuccessStatusCode) {
logger.LogError("{}\n{}", responseMessage.StatusCode, responseMessage.ReasonPhrase);
throw new Exception(responseMessage.ReasonPhrase);
}

var fileName = (responseMessage.Content.Headers.ContentDisposition?.FileNameStar
?? url.Split('/')[^1]).Clean();
await using var fs = new FileStream($"{output}/{fileName}", FileMode.CreateNew);
await content.CopyToAsync(fs);
await responseMessage.Content.CopyToAsync(fs);
}
catch (Exception exception) {
logger.LogError("Failed to download {}\n{}", url, exception);
}
}

public async Task<IDocument> ParseAsync(string url) {
try {
var retries = 0;
IDocument document;
do {
var requestMessage = new HttpRequestMessage {
Method = HttpMethod.Get,
RequestUri = new Uri(url),
Headers = {
{
"User-Agent", configuration.GetSection("Http:UserAgents").Get<string[]>().RandomItem()
}
}
};

await Task.Delay(Random.Shared.Next(configuration.GetValue<int>("Http:Delay")));
using var responseMessage = await httpClient.SendAsync(requestMessage);
if (!responseMessage.IsSuccessStatusCode) {
logger.LogError("{}\n{}", responseMessage.StatusCode, responseMessage.ReasonPhrase);
throw new Exception(responseMessage.ReasonPhrase);
}

await using var stream = await responseMessage.Content.ReadAsStreamAsync();
document = await _context.OpenAsync(x => x.Content(stream));
await document.WaitForReadyAsync();
if (document.All.Length == 3) {
retries++;
continue;
}

break;
} while (retries <= configuration.GetValue<int>("Http:Retries"));

return document;
}
catch {
logger.LogError("Failed to download {}", url);
catch (Exception exception) {
logger.LogError("Failed to get {}\n{}", url, exception);
throw;
}
}
}

0 comments on commit 0f24df2

Please sign in to comment.