Skip to content

Commit

Permalink
Sort JSON array before calculating hashes. (#23)
Browse files Browse the repository at this point in the history
  • Loading branch information
paul-m authored Jan 13, 2025
1 parent d17ec60 commit ac88808
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 20 deletions.
52 changes: 35 additions & 17 deletions src/ETL/Load/Load.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ public function __construct(

public function run($item): int
{

$state = $this->itemState($item);

if ($state == Harvester::HARVEST_LOAD_NEW_ITEM || $state == Harvester::HARVEST_LOAD_UPDATED_ITEM) {
Expand All @@ -35,38 +34,57 @@ public function run($item): int
$identifier = Util::getDatasetId($item);

$hash = Util::generateHash($item);
$object = (object) ['harvest_plan_id' => $this->harvestPlan->identifier, "hash" => $hash];
$object = (object) [
'harvest_plan_id' => $this->harvestPlan->identifier,
'hash' => $hash
];
$this->hashStorage->store(json_encode($object), $identifier);
}

return $state;
}

private function itemState($item): int
/**
* Determine what to do next for the item, based on hash comparison.
*
* @param $item
* The item we're dealing with, as an arbitrary data structure.
*
* @return int
* One of the various Harvester constants.
*
* @see \Harvest\Harvester
*/
protected function itemState($item): int
{
if (isset($item->identifier)) {
$identifier = Util::getDatasetId($item);

$json = $this->hashStorage->retrieve($identifier);

// Load the hash from storage, for comparison, if it exists.
$hash = null;
if (isset($json)) {
$data = json_decode($json);
$hash = $data->hash;
if ($hash_json = $this->hashStorage->retrieve(Util::getDatasetId($item))) {
$hash_object = json_decode($hash_json);
$hash = $hash_object->hash ?? null;
}

if (isset($hash)) {
$new_hash = Util::generateHash($item);
if ($hash == $new_hash) {
if ($hash) {
if ($hash === Util::generateHash($item)) {
// Hashes matched, so no change.
return Harvester::HARVEST_LOAD_UNCHANGED;
} else {
// Hashes don't match, so try again with the legacy hash
// generator. This might match if the hash was generated
// before we changed the hashing system.
if ($hash === Util::legacyGenerateHash($item)) {
return Harvester::HARVEST_LOAD_UNCHANGED;
}
// We do have a past hash record, but neither new nor
// legacy hash matched, so update the dataset.
return Harvester::HARVEST_LOAD_UPDATED_ITEM;
}
} else {
return Harvester::HARVEST_LOAD_NEW_ITEM;
}
} else {
throw new \Exception("Item does not have an identifier " . json_encode($item));
// There was no existing hash in storage, so this is a new
// item.
return Harvester::HARVEST_LOAD_NEW_ITEM;
}
throw new \Exception('Item does not have an identifier ' . json_encode($item));
}
}
59 changes: 58 additions & 1 deletion src/Util.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,46 @@

class Util
{

/**
* Generate a hash of a data structure.
*
* @param mixed $item
* The data structure to hash.
*
* @return string
* Hash of the serialized data structure.
*/
public static function generateHash($item): string
{
return hash('sha256', serialize($item));
// Encode as JSON and then decode as associative arrays.
// @todo Is this the most efficient way to convert an arbitrary
// data structure into an array?
$decoded = json_decode(
json_encode($item, JSON_THROW_ON_ERROR),
true
);
// Sort the array by keys.
static::recursiveKeySort($decoded);
return hash('sha256', serialize($decoded));
}

/**
* Legacy hash generation.
*
* We use this for a secondary comparison, if generateHash() says they
* don't match.
*
* @param mixed $item
* The data structure to hash.
*
* @return string
* Hash of the serialized data structure.
*/
public static function legacyGenerateHash($item): string
{
return hash('sha256', serialize($item));
}

public static function getDatasetId($dataset): string
{
Expand All @@ -24,4 +59,26 @@ public static function getDatasetId($dataset): string
}
return "{$id}";
}

/**
* Sort an array in place by key.
*
* Recursively applies ksort() to any value in the array that is an array.
*
* @param $array
* The array to be sorted.
* @param $flags
* Flags to pass along to ksort().
*
* @see \ksort()
*/
public static function recursiveKeySort(&$array, int $flags = SORT_REGULAR): void
{
foreach ($array as &$value) {
if (is_array($value)) {
static::recursiveKeySort($value);
}
}
ksort($array, $flags);
}
}
90 changes: 90 additions & 0 deletions test/ETL/Load/LoadTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
<?php

namespace HarvestTest\ETL\Load;

use Contracts\RetrieverInterface;
use Harvest\ETL\Load\Load;
use Harvest\Harvester;
use Harvest\Storage\StorageInterface;
use PHPUnit\Framework\TestCase;

/**
* @covers \Harvest\ETL\Load\Load
* @coversDefaultClass \Harvest\ETL\Load\Load
*
* @group harvest
*/
class LoadTest extends TestCase
{

public static function provideUnchangedHash(): array
{
return [
'legacy' => ['918dae0183618b5a4c0da7cd0ca82a61a95c7d03c4822c2f46e4c34be43f7f65'],
'not_legacy' => ['6474af9ec864832917578b9a0e1824315cb98b907e66824ea06e1ce193fe3c0a'],
];
}

/**
* @covers ::itemState
* @dataProvider provideUnchangedHash
*/
public function testLoadUnchanged(string $hash): void
{
$identifier = 'test_id';

// Mock up a hash storage to give us back a hash that will match,
// either as a legacy hash or a non-legacy one, depending on the
// data provider.
$hash_storage = $this->getMockBuilder(RetrieverInterface::class)
->onlyMethods(['retrieve'])
->getMockForAbstractClass();
$hash_storage->expects($this->once())
->method('retrieve')
->with($identifier)
// Mock up our hash object.
->willReturn(json_encode((object) [
'identifier' => $identifier,
'hash' => $hash,
]));

// Mock up a loader that has run() and itemState() so we can test. Pass
// along our mocked hash storage to the constructor.
$load = $this->getMockBuilder(Load::class)
->setConstructorArgs([
$this->createStub(StorageInterface::class),
$hash_storage,
$this->createStub(StorageInterface::class)
])
->getMockForAbstractClass();

// Finally assert that our data has a matching hash. You can make this
// test fail by changing the 'something' => 'else' data, resulting in
// a different hash.
$this->assertEquals(
Harvester::HARVEST_LOAD_UNCHANGED,
$load->run((object) [
'identifier' => $identifier,
'something' => 'else',
])
);
}

/**
* @covers ::itemState
*/
public function testItemStateException(): void
{
// The itemState() method should throw an exception if the item does
// not have an identifier. This should make its way back through run()
// to the caller.
$load = $this->getMockBuilder(Load::class)
->disableOriginalConstructor()
->getMockForAbstractClass();

$this->expectException(\Exception::class);
$this->expectExceptionMessage('Item does not have an identifier');

$load->run((object) ['no_identifier' => '']);
}
}
75 changes: 73 additions & 2 deletions test/UtilTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,83 @@
use PHPUnit\Framework\TestCase;
use Harvest\Util;

/**
* @covers \Harvest\Util
* @coversDefaultClass \Harvest\Util
*
* @group harvest
*/
class UtilTest extends TestCase
{
public function test(): void
{
$fake_dataset = "Not an object";
$this->expectExceptionMessage("The dataset " . json_encode($fake_dataset) . " is not an object.");
$fake_dataset = 'Not an object';
$this->expectExceptionMessage('The dataset ' . json_encode($fake_dataset) . ' is not an object.');
Util::getDatasetId($fake_dataset);
}

public static function provideJsonSame(): array
{
return [
'same_shallow_values' => [
(object) ['key_a' => 'value', 'key_b' => 'value'],
(object) ['key_b' => 'value', 'key_a'=> 'value'],
],
'same_deep_values' => [
['key_a' => 'value', 'key_b' => ['deep_1' => 'value1', 'deep_2' => 'value2']],
['key_b' => ['deep_2' => 'value2', 'deep_1' => 'value1'], 'key_a'=> 'value'],
],
];
}

/**
* @covers ::generateHash
* @dataProvider provideJsonSame
*/
public function testJsonHashSame($first, $second): void
{
$this->assertEquals(
Util::generateHash($first),
Util::generateHash($second)
);
}


public static function provideArrays(): array
{
return [
'fully_flipped' => [
[
'a' => [
'A' => '2',
'B' => '2',
],
'z' => [
'Y' => '1',
'Z' => '1',
],
],
[
'z' => [
'Z' => '1',
'Y' => '1',
],
'a' => [
'B' => '2',
'A' => '2',
],
],
],
];
}

/**
* @covers ::recursiveKeySort
* @dataProvider provideArrays
*/
public function testRecursiveKSort(array $expected_array, array $array): void
{
Util::recursiveKeySort($array);
$this->assertSame($expected_array, $array);
}
}

0 comments on commit ac88808

Please sign in to comment.