Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test duplicates ignore municipality #1043

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
using Schema.Search;
using DuplicateVerenigingDetection;
using Vereniging;
using Microsoft.Extensions.Logging.Abstractions;
using Nest;
using System.Collections.Immutable;

public class SearchDuplicateVerenigingDetectionService : IDuplicateVerenigingDetectionService
{
private readonly IElasticClient _client;
private readonly ILogger<SearchDuplicateVerenigingDetectionService> _logger;

public SearchDuplicateVerenigingDetectionService(IElasticClient client)
public SearchDuplicateVerenigingDetectionService(IElasticClient client, ILogger<SearchDuplicateVerenigingDetectionService> logger = null)
{
_client = client;
_logger = logger ?? NullLogger<SearchDuplicateVerenigingDetectionService>.Instance;
}

public async Task<IReadOnlyCollection<DuplicaatVereniging>> GetDuplicates(VerenigingsNaam naam, Locatie[] locaties,
Expand All @@ -33,19 +36,35 @@ await _client
s => s
.Explain(includeScore)
.TrackScores(includeScore)
//.MinScore(minimumScoreOverride.Value)
.Query(
q => q.Bool(
b => b.Must(
MatchOpNaam(naam),
IsNietGestopt,
IsNietDubbel
)
.MustNot(BeVerwijderd)
.Filter(MatchOpPostcodeOfGemeente(gemeentes, postcodes)
)
)
));
//.MinScore(minimumScoreOverride.Value)
.Query(
q => q.Bool(
b => b
.Should(
// Original must query
s1 => s1.Bool(

b => b.Must(
MatchOpNaam(naam)
)),
s2 => s2.Bool(
b => b.Must(
MatchOpFullNaam(naam))
))
.MinimumShouldMatch(1) // At least one of the clauses must match
.Filter(MatchOpPostcodeOfGemeente(gemeentes, postcodes),
IsNietGestopt,
IsNietDubbel,
IsNietVerwijderd)

)
));

_logger.LogInformation("Score for query: {Score}", string.Join(", ", searchResponse.Hits.Select(x => $"{x.Score} {x.Source.Naam}")));
searchResponse.Hits.ToList().ForEach(x =>
{
_logger.LogInformation("Query: {Query}Explanation for Score {Score} of '{Naam}': {@Explanation}", naam, x.Score, x.Source.Naam, x.Explanation);
});

return searchResponse.Hits
.Select(ToDuplicateVereniging)
Expand Down Expand Up @@ -81,13 +100,13 @@ private static QueryContainer IsNietDubbel(QueryContainerDescriptor<DuplicateDet
.Value(false));
}

private static QueryContainer BeVerwijderd(QueryContainerDescriptor<DuplicateDetectionDocument> shouldDescriptor)
private static QueryContainer IsNietVerwijderd(QueryContainerDescriptor<DuplicateDetectionDocument> shouldDescriptor)
{
return shouldDescriptor
.Term(termDescriptor
=> termDescriptor
.Field(document => document.IsVerwijderd)
.Value(true));
.Value(false));
}

private static Func<QueryContainerDescriptor<DuplicateDetectionDocument>, QueryContainer> MatchOpPostcode(string[] postcodes)
Expand Down Expand Up @@ -133,18 +152,26 @@ private static IEnumerable<Func<QueryContainerDescriptor<DuplicateDetectionDocum
}

/// <summary>
/// Builds the query fragment that matches documents on their <c>Naam</c> field.
/// </summary>
/// <param name="naam">The association name to search for; passed to the match query as its query text.</param>
/// <returns>
/// A delegate producing a match query on <c>Naam</c> that uses the duplicate analyzer,
/// length-based automatic fuzziness (2, 3) and a <c>"3&lt;75%"</c> minimum-should-match expression.
/// </returns>
private static Func<QueryContainerDescriptor<DuplicateDetectionDocument>, QueryContainer> MatchOpNaam(VerenigingsNaam naam)
    => queryDescriptor => queryDescriptor.Match(
        matchDescriptor => matchDescriptor
            .Field(document => document.Naam)
            .Query(naam)
            .Analyzer(DuplicateDetectionDocumentMapping.DuplicateAnalyzer)
            .Fuzziness(Fuzziness.AutoLength(2, 3))
            .MinimumShouldMatch("3<75%"));

private static Func<QueryContainerDescriptor<DuplicateDetectionDocument>, QueryContainer> MatchOpFullNaam(VerenigingsNaam naam)
{
return must => must
.Match(m => m
.Field(f => f.Naam)
.Query(naam)
.Analyzer(DuplicateDetectionDocumentMapping
.DuplicateAnalyzer)
.Fuzziness(
Fuzziness
.Auto) // Assumes this analyzer applies lowercase and asciifolding
.MinimumShouldMatch("90%") // You can adjust this percentage as needed
);
.Field("naam.naamFull")
.Query(naam)//.ToString().Replace(" ", ""))
.Analyzer(DuplicateDetectionDocumentMapping.DuplicateFullNameAnalyzer)
.Fuzziness(Fuzziness.AutoLength(3,3))
.MinimumShouldMatch("75%")
); // You can adjust this percentage as needed);
}

private static DuplicaatVereniging ToDuplicateVereniging(IHit<DuplicateDetectionDocument> document)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ private static AnalyzersDescriptor AddDuplicateDetectionAnalyzer(AnalyzersDescri
.Tokenizer("standard")
.CharFilters("underscore_replace", "dot_replace")
.Filters("lowercase", "asciifolding", "dutch_stop")
).Custom(DuplicateDetectionDocumentMapping.DuplicateFullNameAnalyzer,
selector: ca
=> ca
.Tokenizer("keyword")
.CharFilters("underscore_replace", "dot_replace")
.Filters("lowercase", "asciifolding", "dutch_stop")
);

private static NormalizersDescriptor AddVerenigingZoekNormalizer(NormalizersDescriptor ad)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ namespace AssociationRegistry.Admin.Schema.Search;
public static class DuplicateDetectionDocumentMapping
{
public const string DuplicateAnalyzer = "duplicate_analyzer";
public const string DuplicateFullNameAnalyzer = "duplicate_fullname_analyzer";

public static TypeMappingDescriptor<DuplicateDetectionDocument> Get(TypeMappingDescriptor<DuplicateDetectionDocument> map)
=> map
Expand All @@ -15,8 +16,16 @@ public static TypeMappingDescriptor<DuplicateDetectionDocument> Get(TypeMappingD
.Name(document => document.VCode))
.Text(
propertyDescriptor => propertyDescriptor
.Name(document => document.Naam)
.Analyzer(DuplicateAnalyzer))
.Name(document => document.Naam)
.Fields(fields => fields
.Text(subField => subField
.Name(x => x.Naam)
.Analyzer(DuplicateAnalyzer)
)
.Text(subField => subField
.Name("naamFull")
.Analyzer(DuplicateFullNameAnalyzer)
)))
.Text(propertyDescriptor => propertyDescriptor
.Name(document => document.KorteNaam)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
<EmbeddedResource Include="Commands\VerenigingMetRechtspersoonlijkheid\When_RegistreerVerenigingMetRechtspersoonlijkheid\files\request.with_kboNummer.json" />
<EmbeddedResource Include="Commands\VerenigingMetRechtspersoonlijkheid\When_RegistreerVerenigingMetRechtspersoonlijkheid\With_Kbo_Nummer_For_Supported_Rechtsvorm\files\request.with_kboNummer.json" />
<EmbeddedResource Include="Commands\VerenigingMetRechtspersoonlijkheid\When_RegistreerVerenigingMetRechtspersoonlijkheid\With_Kbo_Nummer_For_Unsupported_Organisaties\files\request.with_kboNummer.json" />
<EmbeddedResource Include="DuplicateDetection\Given_An_Extensive_DataSet\Seed\verwachte_dubbels.csv" />
<EmbeddedResource Include="DuplicateDetection\Given_An_Extensive_DataSet\Seed\verwachte_unieke.csv" />
<EmbeddedResource Include="FeitelijkeVereniging\When_RegistreerFeitelijkeVereniging\files\*.json" />
<EmbeddedResource Include="Framework\templates\DetailVerenigingResponse.json" />
<EmbeddedResource Include="Framework\templates\kboSyncHistoriek\KboSyncHistoriek.json" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
namespace AssociationRegistry.Test.Admin.Api.DuplicateDetection.Given_An_Extensive_DataSet;

using CsvHelper.Configuration.Attributes;

/// <summary>
/// One row of a duplicate-detection seed CSV file, bound by CsvHelper through the column names
/// given in the <c>Name</c> attributes.
/// </summary>
/// <param name="Naam">Value of the <c>Naam</c> column.</param>
/// <param name="TeRegistrerenNaam">
/// Value of the <c>TeRegistrerenNaam</c> column; presumably the name that is submitted for
/// registration and checked against <paramref name="Naam"/> — TODO confirm against the tests.
/// </param>
public record DuplicateDetectionSeedLine(
[property: Name("Naam")] string Naam,
[property: Name("TeRegistrerenNaam")] string TeRegistrerenNaam);
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
namespace AssociationRegistry.Test.Admin.Api.DuplicateDetection.Given_An_Extensive_DataSet;

using AssociationRegistry.Admin.Api.Adapters.DuplicateVerenigingDetectionService;
using AssociationRegistry.Admin.ProjectionHost.Infrastructure.ElasticSearch;
using AssociationRegistry.Admin.Schema.Search;
using AutoFixture;
using Common.AutoFixture;
using CsvHelper;
using CsvHelper.Configuration;
using FluentAssertions;
using Microsoft.Extensions.Logging.Abstractions;
using Nest;
using System.Collections.ObjectModel;
using System.Globalization;
using Vereniging;
using Xunit;
using AssociationRegistry.Admin.Api.Infrastructure.Extensions;
using AssociationRegistry.Hosts.Configuration.ConfigurationBindings;
using DuplicateVerenigingDetection;
using Microsoft.Extensions.Logging;
using Xunit.Abstractions;
using LogLevel = Nest.LogLevel;

/// <summary>
/// Test harness that (re)creates a duplicate-detection index on a local Elasticsearch node,
/// seeds it with the names from the embedded "verwachte_dubbels" CSV and exposes
/// <see cref="GetDuplicatesFor"/> to query the <see cref="SearchDuplicateVerenigingDetectionService"/>.
/// </summary>
public class DuplicateDetectionTest
{
    // Always assigned in the constructor, so it is declared non-nullable.
    private readonly Adres _adres;
    private readonly Fixture _fixture;
    private readonly ElasticClient _elastic;
    private readonly string _duplicateDetectionIndex;
    private SearchDuplicateVerenigingDetectionService _duplicateVerenigingDetectionService;

    /// <summary>Seed lines read from the embedded <c>verwachte_dubbels.csv</c> resource.</summary>
    public IReadOnlyCollection<DuplicateDetectionSeedLine> VerwachteDubbels { get; private set; }

    /// <summary>Seed lines read from the embedded <c>verwachte_unieke.csv</c> resource.</summary>
    public IReadOnlyCollection<DuplicateDetectionSeedLine> VerwachteUnieke { get; private set; }

    /// <summary>
    /// Creates the Elasticsearch client, prepares the fixed test address and initializes the index.
    /// </summary>
    /// <param name="duplicateDetectionIndex">Name of the index to (re)create and query.</param>
    /// <param name="helper">xUnit output helper that receives the Elasticsearch client log output.</param>
    public DuplicateDetectionTest(string duplicateDetectionIndex, ITestOutputHelper helper)
    {
        _fixture = new Fixture().CustomizeAdminApi();
        _duplicateDetectionIndex = duplicateDetectionIndex;

        _elastic = ElasticSearchExtensions.CreateElasticClient(new ElasticSearchOptionsSection()
        {
            Uri = "http://localhost:9200",
            Username = "elastic",
            Password = "local_development",
            Indices = new ElasticSearchOptionsSection.IndicesOptionsSection()
            {
                DuplicateDetection = _duplicateDetectionIndex,
            }
        }, new TestOutputLogger(helper, duplicateDetectionIndex));

        // Every seeded document and every lookup uses this same postcode/municipality.
        _adres = _fixture.Create<Adres>() with
        {
            Postcode = "8500",
            Gemeente = Gemeentenaam.Hydrate("Kortrijk"),
        };

        // NOTE(review): this class does not declare IAsyncLifetime, so initialization is
        // driven synchronously from the constructor; consider moving to IAsyncLifetime.
        InitializeAsync().GetAwaiter().GetResult();
    }

    /// <summary>
    /// Indexes one document per seed line (using the line's <c>Naam</c>) and refreshes all indices
    /// so the documents are searchable when this method returns.
    /// </summary>
    /// <param name="readVerwachtDubbels">Seed lines whose <c>Naam</c> values are indexed.</param>
    public async Task InsertGeregistreerdeVerenigingen(IReadOnlyCollection<DuplicateDetectionSeedLine> readVerwachtDubbels)
    {
        var toRegisterDuplicateDetectionDocuments = readVerwachtDubbels.Select(x => new DuplicateDetectionDocument() with
        {
            Naam = x.Naam,
            VerenigingsTypeCode = Verenigingstype.FeitelijkeVereniging.Code,
            HoofdactiviteitVerenigingsloket = [],
            Locaties = [_fixture.Create<DuplicateDetectionDocument.Locatie>() with { Gemeente = _adres.Gemeente.Naam, Postcode = _adres.Postcode }]
        });

        foreach (var doc in toRegisterDuplicateDetectionDocuments)
        {
            await _elastic.IndexDocumentAsync(doc);
        }

        await _elastic.Indices.RefreshAsync(Indices.AllIndices);
    }

    /// <summary>Reads the seed lines from the embedded CSV resource with the given name.</summary>
    /// <param name="associationregistryTestAdminApiDuplicatedetectionGivenAnExtensiveDatasetVerwachtdubbelsCsv">
    /// Fully qualified manifest resource name of the CSV file.
    /// </param>
    /// <returns>All records of the CSV file, in file order.</returns>
    public static IReadOnlyCollection<DuplicateDetectionSeedLine> ReadSeed(string associationregistryTestAdminApiDuplicatedetectionGivenAnExtensiveDatasetVerwachtdubbelsCsv)
        => ReadSeedFile(associationregistryTestAdminApiDuplicatedetectionGivenAnExtensiveDatasetVerwachtdubbelsCsv);

    /// <summary>
    /// Parses an embedded, comma-delimited CSV resource (with header row) into seed lines.
    /// </summary>
    private static IReadOnlyCollection<DuplicateDetectionSeedLine> ReadSeedFile(string resourceName)
    {
        var assembly = typeof(Then_Some_Duplicates_Are_Expected).Assembly;
        var stream = assembly.GetResource(resourceName);

        // Disposing the reader also disposes the underlying resource stream.
        using var streamReader = new StreamReader(stream);
        using var csvReader = new CsvReader(streamReader, new CsvConfiguration(CultureInfo.InvariantCulture)
        {
            Delimiter = ",",
            HasHeaderRecord = true,
            Quote = '"',
        });

        // Materialize before the readers are disposed; GetRecords is lazily evaluated.
        var records = csvReader.GetRecords<DuplicateDetectionSeedLine>()
                               .ToArray();

        return new ReadOnlyCollection<DuplicateDetectionSeedLine>(records);
    }

    /// <summary>
    /// Drops the index when it already exists, recreates it, builds the detection service,
    /// loads both seed files and indexes the expected duplicates.
    /// </summary>
    public async Task InitializeAsync()
    {
        // Await the Elasticsearch calls instead of blocking on them inside an async method.
        var existsResponse = await _elastic.Indices.ExistsAsync(_duplicateDetectionIndex);

        if (existsResponse.Exists)
        {
            await _elastic.Indices.DeleteAsync(_duplicateDetectionIndex);
        }

        _elastic.Indices.CreateDuplicateDetectionIndex(_duplicateDetectionIndex);

        _duplicateVerenigingDetectionService = new SearchDuplicateVerenigingDetectionService(_elastic, NullLogger<SearchDuplicateVerenigingDetectionService>.Instance);

        VerwachteDubbels = ReadSeed("AssociationRegistry.Test.Admin.Api.DuplicateDetection.Given_An_Extensive_DataSet.Seed.verwachte_dubbels.csv");
        VerwachteUnieke = ReadSeed("AssociationRegistry.Test.Admin.Api.DuplicateDetection.Given_An_Extensive_DataSet.Seed.verwachte_unieke.csv");
        await InsertGeregistreerdeVerenigingen(VerwachteDubbels);
    }

    /// <summary>No-op; the index is left in place after the run.</summary>
    public Task DisposeAsync()
        => Task.CompletedTask;

    /// <summary>
    /// Asks the detection service for duplicates of <paramref name="teRegistrerenNaam"/> at the fixed test address.
    /// </summary>
    /// <param name="teRegistrerenNaam">Name of the association that would be registered.</param>
    /// <returns>The duplicates reported by the detection service.</returns>
    public async Task<IReadOnlyCollection<DuplicaatVereniging>> GetDuplicatesFor(string teRegistrerenNaam)
        => await _duplicateVerenigingDetectionService.GetDuplicates(VerenigingsNaam.Create(teRegistrerenNaam),
        [
            _fixture.Create<Locatie>() with
            {
                Adres = _adres,
            },
        ]);
}

/// <summary>
/// <see cref="ILogger"/> implementation that forwards every log entry to an xUnit
/// <see cref="ITestOutputHelper"/>, prefixed with the log level and category name.
/// </summary>
public class TestOutputLogger : ILogger
{
    private readonly ITestOutputHelper _outputHelper;
    private readonly string _categoryName;

    /// <summary>Creates a logger that writes to <paramref name="outputHelper"/>.</summary>
    /// <param name="outputHelper">Destination for the formatted log lines.</param>
    /// <param name="categoryName">Category name included in every log line.</param>
    /// <exception cref="ArgumentNullException"><paramref name="outputHelper"/> is <c>null</c>.</exception>
    public TestOutputLogger(ITestOutputHelper outputHelper, string categoryName)
    {
        _outputHelper = outputHelper ?? throw new ArgumentNullException(nameof(outputHelper));
        _categoryName = categoryName;
    }

    /// <summary>Scopes are not supported; always returns <c>null</c>.</summary>
    public IDisposable BeginScope<TState>(TState state)
    {
        return null; // Scopes are not implemented
    }

    /// <summary>Every log level is enabled.</summary>
    public bool IsEnabled(Microsoft.Extensions.Logging.LogLevel logLevel)
        => true;

    /// <summary>
    /// Formats the entry and writes it once to the test output, followed by the exception (if any).
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="formatter"/> is <c>null</c>.</exception>
    public void Log<TState>(Microsoft.Extensions.Logging.LogLevel logLevel, EventId eventId, TState state, Exception? exception, Func<TState, Exception?, string> formatter)
    {
        if (!IsEnabled(logLevel))
        {
            return;
        }

        if (formatter == null)
        {
            throw new ArgumentNullException(nameof(formatter));
        }

        // The formatting/writing sequence used to be duplicated here, which emitted
        // every message and exception twice; it now runs exactly once.
        var message = formatter(state, exception);

        if (!string.IsNullOrEmpty(message))
        {
            var logEntry = $"[{logLevel}] {_categoryName}: {message}";
            _outputHelper.WriteLine(logEntry);
        }

        if (exception != null)
        {
            _outputHelper.WriteLine(exception.ToString());
        }
    }
}

/// <summary>
/// <see cref="ILoggerProvider"/> that hands out <see cref="TestOutputLogger"/> instances
/// writing to a single xUnit <see cref="ITestOutputHelper"/>.
/// </summary>
public class TestOutputLoggerProvider : ILoggerProvider
{
    private readonly ITestOutputHelper _outputHelper;

    /// <summary>Stores the output helper that all created loggers will write to.</summary>
    /// <param name="outputHelper">Destination passed on to each created logger.</param>
    public TestOutputLoggerProvider(ITestOutputHelper outputHelper)
        => _outputHelper = outputHelper;

    /// <summary>Creates a new logger for <paramref name="categoryName"/>.</summary>
    /// <param name="categoryName">Category name given to the created logger.</param>
    /// <returns>A fresh <see cref="TestOutputLogger"/>.</returns>
    public ILogger CreateLogger(string categoryName)
        => new TestOutputLogger(_outputHelper, categoryName);

    /// <summary>Does nothing.</summary>
    public void Dispose()
    {
        // No resources to dispose
    }
}
Loading
Loading