Skip to content

Commit

Permalink
Construct a name from IA metadata.
Browse files Browse the repository at this point in the history
This makes the Commons name an optional field. If not given,
a name is constructed from the IA metadata, if possible.

It can still be entered via the query parameter on /fill, so
it's possible to preload a custom name (e.g. via a template on-wiki)
  • Loading branch information
inductiveload committed Jul 15, 2021
1 parent 61ed200 commit 293260e
Show file tree
Hide file tree
Showing 4 changed files with 288 additions and 23 deletions.
15 changes: 6 additions & 9 deletions src/Controller/UploadController.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use Wikisource\IaUpload\ApiClient\IaClient;
use Wikisource\IaUpload\OAuth\MediaWikiOAuth;
use Wikisource\IaUpload\OAuth\Token\ConsumerToken;
use Wikisource\IaUpload\Utils\IaMetadataUtils;

/**
* Controller for the commons upload process
Expand Down Expand Up @@ -170,7 +171,6 @@ public function init( Request $request, Response $response ) {
$query = $request->getQueryParams();
return $this->outputsInitTemplate( [
'iaId' => $query['iaId'] ?? '',
'commonsName' => $query['commonsName'] ?? '',
'jobs' => $jobs,
], $response );
}
Expand Down Expand Up @@ -210,11 +210,10 @@ public function fill( Request $request, Response $response ) {
$format = $query['format'] ?? 'djvu';
$fileSource = $query['fileSource'] ?? 'djvu';
// Validate inputs.
if ( $iaId === '' || $commonsName === '' ) {
if ( $iaId === '' ) {
return $this->outputsInitTemplate( [
'iaId' => $iaId,
'format' => $format,
'commonsName' => $commonsName,
'error' => $this->i18n->message( 'set-all-fields' ),
], $response );
}
Expand All @@ -223,7 +222,6 @@ public function fill( Request $request, Response $response ) {
return $this->outputsInitTemplate( [
'iaId' => $iaId,
'format' => $format,
'commonsName' => $commonsName,
'error' => $this->i18n->message( 'invalid-length', [ $commonsName ] ),
], $response );
}
Expand All @@ -234,7 +232,6 @@ public function fill( Request $request, Response $response ) {
return $this->outputsInitTemplate( [
'iaId' => $iaId,
'format' => $format,
'commonsName' => $commonsName,
'error' => $this->i18n->message( 'invalid-commons-name', [ $commonsName ] ),
], $response );
}
Expand All @@ -248,12 +245,15 @@ public function fill( Request $request, Response $response ) {
return $this->outputsInitTemplate( [
'iaId' => $iaId,
'format' => $format,
'commonsName' => $commonsName,
'error' => $this->i18n->message( 'no-found-on-ia', [ $link ] ),
], $response );
}
$iaId = $iaData['metadata']['identifier'][0];

if ( $commonsName == '' ) {
$commonsName = IaMetadataUtils::getCommonsNameFromIaData( $iaData, ' - ' );
}

// Make sure at least one of the required input formats is available.
$djvuFilename = $this->getIaFileName( $iaData, 'djvu' );
$pdfFilename = $this->getIaFileName( $iaData, 'pdf' );
Expand All @@ -263,7 +263,6 @@ public function fill( Request $request, Response $response ) {
return $this->outputsInitTemplate( [
'iaId' => $iaId,
'format' => $format,
'commonsName' => $commonsName,
'error' => $this->i18n->message( 'no-usable-files-found' ),
], $response );
}
Expand Down Expand Up @@ -296,7 +295,6 @@ public function fill( Request $request, Response $response ) {
return $this->outputsInitTemplate( [
'iaId' => $iaId,
'format' => $format,
'commonsName' => $commonsName,
'error' => $this->i18n->message( 'already-on-commons', [ $link ] ),
], $response );
}
Expand Down Expand Up @@ -479,7 +477,6 @@ public function downloadDjvu( Request $request, Response $response, $iaId ) {
protected function outputsInitTemplate( array $params, Response $response ) {
$defaultParams = [
'iaId' => '',
'commonsName' => '',
'jobs' => [],
'format' => 'djvu',
'wiki_base_url' => $this->config['wiki_base_url'],
Expand Down
280 changes: 280 additions & 0 deletions src/Utils/IaMetadataUtils.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
<?php

namespace Wikisource\IaUpload\Utils;

/**
* General utilities for handling IA metadata
*
* @file
* @ingroup IaUpload
*
* @license GPL-2.0-or-later
*/
class IaMetadataUtils {

	/**
	 * Upper bound, in bytes, for the title portion of a generated name.
	 * The author and year suffixes are appended after truncation.
	 */
	private const MAX_TITLE_BYTES = 200;

	/**
	 * Patterns (matched against a lower-cased name) that mark a creator
	 * as something other than an individual person.
	 *
	 * Non-ASCII alternatives are written as alternations rather than
	 * character classes: the patterns are matched byte-wise (no "u"
	 * modifier), and a multi-byte character inside [...] would only ever
	 * match one of its bytes.
	 *
	 * Note: watch out for ranks like "secretary" and "general", which can
	 * be appended to a real name.
	 */
	private const NON_HUMAN_PATTERNS = [
		// misc junk from IA metadata
		'/unknown/',

		// organisations
		'/university/',
		'/college/',
		'/school/',
		'/librar(ies|y)/',
		'/institut(e|ion)/',
		'/museum/',
		'/collection/',
		'/cent(re|er)/',
		'/press/',
		'/company/',
		'/agenc(y|ies)/',
		'/\b(inc|co|ltd|ag|gmbh|corp|dept)\.?\b/',
		'/assoc(\.?\b|iation)/',
		'/soci(al|ety|eties|edad|ete|été)/',
		'/corporation/',
		'/committee/',
		'/\bboards?\b/',
		'/department/',
		'/division/',
		'/council/',
		'/na[ct]ional/',
		'/\bpublic\b/',
		'/office/',
		'/bureau/',
		'/exhibitions?/',
		'/\bbody\b/',
		'/\bunions?\b/',
		'/research/',
		'/service/',
		'/program(me)?/',
		'/project/',
		'/counsel/',
		'/organi[sz]ation/',
		'/charit(y|ies)/',
		'/commission/',
		'/comisi(o|ó)n/',
		'/affairs/',
		'/district/',
		'/senate/',
		'/parliament/',
		// "house" can be a name
		'/house of/',
		'/congress/',
		// "church" can be a name
		'/church (in|of)/',
		'/\bregion/',
		'/volume/',
		'/state/',
		// also confederate
		'/federa(l|tion|te)/',
		'/administration/',
		'/ministr(y|ies)/',
		'/official/',

		// publications/events
		'/conference/',
		'/transactions/',
		'/proceedings/',
		'/server/',
		'/publications/',
		'/journal/',
		'/periodical/',
		'/serial/',
		'/newspaper/',

		// places
		// note: some places like "England/English" can actually be valid names
		// so we hope they occur with other banned words
		'/american/',
		'/u\.\s?s\.\s?a\./',
		'/united/',
		'/british/',
		'/canad(a|ian)/',
		'/japan/',
		'/chin(a|ese)/',
		'/ital(y|ia)/',
		'/fran(ç|c)ais/',
		// NESW by themselves can be names
		'/(south|north)(east|west)/',

		// subjects
		'/auction/',
		'/biblical/',
		'/broadcast/',
		'/education(|al)/',
		'/histor(y|ia)/',
		'/industr(y|ial|ies)/',
		'/insurance/',
		'/learning/',
		'/patent/',
		'/record/',
		'/science/',
		'/securit(y|ies)/',
		'/socialis(t|m)/',
		'/transport/',
	];

	/**
	 * Sanitise a name from an IA creator metadata record
	 *
	 * This strips common junk out of the name, for example:
	 * * Bloggs, J. (Joe) -> Bloggs, Joe
	 * * Bloggs, Joe, 1901?-1945 -> Bloggs, Joe
	 * * Bloggs, Joe, b. 1901. -> Bloggs, Joe.
	 *
	 * Surrounding whitespace left behind by the removals is trimmed, so that
	 * "Joe Bloggs 1901-1945" becomes "Joe Bloggs" rather than "Joe Bloggs ".
	 *
	 * @param string $author string directly from IA metadata
	 * @return string sanitised string
	 */
	public static function sanitiseName( string $author ): string {
		// Applied in order; each entry maps a pattern to its replacement.
		$rules = [
			// cataloguing boilerplate
			'/\s*\[from old catalog\]\s*/i' => '',
			// life spans such as "1901?-1945", "(1850-)" or "ca. 1850-1900"
			'/(?:, )?\(?(ca\.?\s*)?\d+\??-(\d+\??)?\)?,?/' => '',
			// lone birth/death dates such as "b. 1901"
			'/(?:, )?\(?\b(b|d)\. (\d+\??)\)?,?/' => '',
			// initial expansions like "A. B. (Arthur Bradley)";
			// at the IA, these should be after a comma
			'/(?<=,\s).*?\((\w{2,}.*?)\)/' => '$1',
			// role suffixes such as ", author" or ", ed."
			'/, (author|editor|ed\.?|illust?(\.|rator)?|trans(|\.|lator)?)\b/i' => '',
		];

		foreach ( $rules as $pattern => $replacement ) {
			$result = preg_replace( $pattern, $replacement, $author );
			// preg_replace() yields null on a PCRE failure; keep the
			// previous value instead of violating the string return type.
			if ( $result !== null ) {
				$author = $result;
			}
		}

		return trim( $author );
	}

	/**
	 * Determine if a name found in an IA creator field looks human
	 *
	 * Certain words indicate that this is a non-human author
	 * (usually a library, GLAM institution or an organisational author).
	 * Sometimes a work can be "by" an institution, but even then we don't
	 * need it in the title.
	 *
	 * @param string $name the candidate name
	 * @return bool true if the name isn't obviously non-human
	 */
	public static function nameLooksHuman( $name ): bool {
		// Lower-case once rather than on every loop iteration.
		$lowered = strtolower( (string)$name );

		foreach ( self::NON_HUMAN_PATTERNS as $pattern ) {
			if ( preg_match( $pattern, $lowered ) === 1 ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Extract the surname from an author
	 *
	 * * Bloggs, Joe -> Bloggs
	 * * Joe Bloggs -> Bloggs
	 *
	 * This is a best-effort attempt and is not guaranteed for all names in
	 * the IA's highly variable metadata.
	 *
	 * @param string $author author string (ideally sanitised first)
	 * @return string|null what appears to be the surname, or null if nothing
	 *  usable was found
	 */
	public static function extractAuthorSurname( string $author ): ?string {
		// first heuristic: if there's a comma, the first bit is the surname
		$commaPos = strpos( $author, ',' );
		if ( $commaPos !== false ) {
			$surname = trim( substr( $author, 0, $commaPos ) );
			return $surname === '' ? null : $surname;
		}

		// otherwise, split on whitespace and return the last word that
		// doesn't contain a number; empty pieces (from leading, trailing or
		// doubled spaces) are discarded so they can't be mistaken for a name
		$words = preg_split( '/\s+/', $author, -1, PREG_SPLIT_NO_EMPTY ) ?: [];

		foreach ( array_reverse( $words ) as $word ) {
			if ( preg_match( '/\d/', $word ) !== 1 ) {
				return $word;
			}
		}

		// didn't find anything useful at all
		return null;
	}

	/**
	 * Get a list of author surnames that are (hopefully) suitable for use in
	 * the file name.
	 *
	 * Names that are ill-formed or look non-human are dropped.
	 *
	 * @param array $iaAuthors array of author strings as found in the IA metadata
	 * @return array array of surnames for names that look human (empty if none)
	 */
	public static function extractAuthorSurnamesForTitle( array $iaAuthors ): array {
		$surnames = [];

		foreach ( $iaAuthors as $author ) {
			// sanitiseName() only accepts strings; skip anything else
			if ( !is_string( $author ) ) {
				continue;
			}

			$cleaned = self::sanitiseName( $author );

			// filter out non-humans
			if ( !self::nameLooksHuman( $cleaned ) ) {
				continue;
			}

			$surname = self::extractAuthorSurname( $cleaned );
			if ( $surname !== null && $surname !== '' ) {
				$surnames[] = $surname;
			}
		}

		return $surnames;
	}

	/**
	 * Construct a best-effort title for Commons from the IA metadata
	 *
	 * The result has the form "<title><sep><surnames><sep><year>", where the
	 * surname and year parts are only present if they can be determined.
	 *
	 * No strong guarantees are made about the suitability of the name, since
	 * the IA metadata is of extremely variable quality.
	 *
	 * @param array $iaData array of IA data, including at least 'metadata'
	 * @param string $sep the field separator
	 * @return string|null a filename (without extension) or null if none can be generated
	 */
	public static function getCommonsNameFromIaData( array $iaData, string $sep ): ?string {
		$meta = $iaData['metadata'] ?? null;

		if ( !is_array( $meta ) || !$meta ) {
			return null;
		}

		// not much we can do if the title doesn't even exist
		$title = self::firstMetadataValue( $meta, 'title' );
		if ( $title === null ) {
			return null;
		}

		// subtitles are sometimes separated by ; or :, these are easy to trim
		$name = preg_replace( '/\s*[:;].*$/', '', $title ) ?? $title;

		// keep the name reasonable
		$name = self::truncateUtf8( $name, self::MAX_TITLE_BYTES );

		// NOTE(review): 'creator' is assumed to be a list of strings, but it
		// may be absent or a bare string; both are normalised to an array.
		$creators = (array)( $meta['creator'] ?? [] );
		$authorNames = self::extractAuthorSurnamesForTitle( $creators );

		if ( $authorNames ) {
			// separator first: the reversed argument order was removed in PHP 8.0
			$name .= $sep . implode( ', ', $authorNames );
		}

		$date = self::firstMetadataValue( $meta, 'date' );
		if ( $date !== null ) {
			// "1923-05-01" -> "1923"
			$year = preg_replace( '/-.*$/', '', $date );

			if ( $year ) {
				$name .= $sep . $year;
			}
		}

		return $name;
	}

	/**
	 * Get the first value of a metadata field as a non-empty string.
	 *
	 * Accepts both a list of values and a bare scalar for the field.
	 *
	 * @param array $meta the IA 'metadata' array
	 * @param string $field the field name, e.g. 'title'
	 * @return string|null the first value, or null if absent or empty
	 */
	private static function firstMetadataValue( array $meta, string $field ): ?string {
		$values = (array)( $meta[$field] ?? [] );
		$first = reset( $values );

		if ( !is_string( $first ) && !is_numeric( $first ) ) {
			return null;
		}

		$first = (string)$first;
		return $first === '' ? null : $first;
	}

	/**
	 * Truncate a string to a maximum number of bytes without leaving a
	 * partial UTF-8 character at the end.
	 *
	 * @param string $text the text to shorten
	 * @param int $maxBytes maximum length of the result in bytes
	 * @return string the (possibly) shortened text
	 */
	private static function truncateUtf8( string $text, int $maxBytes ): string {
		if ( strlen( $text ) <= $maxBytes ) {
			return $text;
		}

		$cut = substr( $text, 0, $maxBytes );

		// If the input wasn't valid UTF-8 to begin with, there is nothing
		// sensible to repair; fall back to the plain byte cut.
		if ( preg_match( '//u', $text ) !== 1 ) {
			return $cut;
		}

		// A UTF-8 character is at most 4 bytes long, so a character split by
		// the cut is removed by dropping at most 3 trailing bytes.
		for ( $i = 0; $i < 3 && preg_match( '//u', $cut ) !== 1; $i++ ) {
			$cut = substr( $cut, 0, -1 );
		}

		return $cut;
	}
}
3 changes: 2 additions & 1 deletion views/commons/fill.twig
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
<label for="commonsName" class="col-lg-2 control-label">{{ 'form-label-commons-name'|message }}</label>
<div class="col-lg-10">
<span class="input-group">
<input name="commonsName" id="commonsName" type="text" size="30" required="required" readonly="readonly" class="form-control" value="{{ commonsName|e }}" />
<input name="commonsName" id="commonsName" type="text" size="30" required="required" class="form-control" value="{{ commonsName|e }}" />
<span class="input-group-addon">.{{ format }}</span>
</span>
<span class="help-block">{{ 'commons-name-help'|message }}</span>
</div>
</div>

Expand Down
13 changes: 0 additions & 13 deletions views/commons/init.twig
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,6 @@
</span>
</div>
</div>
<div class="form-group">
<label for="commonsName" class="col-lg-3 control-label">{{ 'form-label-commons-name'|message }}</label>
<div class="controls col-lg-9">
<div class="input-group">
<input name="commonsName" id="commonsName" type="text" size="30" required="required" class="form-control choose-format" value="{{ commonsName|e }}" maxlength="240" />
<select name="format" id="format" class="form-control input-group-addon">
<option value="pdf"{% if format == 'pdf' %} selected{% endif %}>.pdf</option>
<option value="djvu"{% if format == 'djvu' %} selected{% endif %}>.djvu</option>
</select>
</div>
<span class="help-block">{{ 'commons-name-help'|message }}</span>
</div>
</div>
<div class="form-group">
<div class="col-lg-offset-3 col-lg-9">
<input class="btn btn-primary" type="submit" value="{{ 'get-metadata'|message }}" />
Expand Down

0 comments on commit 293260e

Please sign in to comment.