View source
<?php
namespace Analyzer;
use Nametools\Normalize;
use Nametools\RegexCounter;
/**
 * Analyzes a raw contributor string and turns it into a ContributorObject.
 *
 * The string is classified as an organization, rejected, or handed to the
 * injected normalizer, which matches it against the name patterns registered
 * in setupPatterns().
 */
class ContributorNames implements AnalyzerInterface {

  /**
   * Normalizer the patterns are registered on and that analyze() delegates to.
   *
   * @var \Nametools\Normalize
   */
  private $normalizer;

  /**
   * Lower-cased words whose presence marks a string as an organization.
   *
   * Initialised to an empty list so that checkIsOrganization() can be called
   * before setOrgWords().
   *
   * @var string[]
   */
  private $orgWords = array();

  /**
   * Previously created on the fly in the constructor (a dynamic property,
   * deprecated since PHP 8.2). Declared public to keep the visibility the
   * dynamic property had.
   *
   * @var array
   */
  public $normalizedNames = array();

  /**
   * Previously created on the fly in the constructor; see $normalizedNames.
   *
   * @var array
   */
  public $skippedNames = array();

  /**
   * Builds the analyzer and registers all name patterns on the normalizer.
   *
   * @param \Nametools\Normalize|null $normalizer
   *   Normalizer to use. When omitted, a Normalize instance is created from a
   *   fresh ContributorObject and RegexCounter.
   */
  public function __construct(Normalize $normalizer = null) {
    $this->normalizer = $normalizer ?: new Normalize(new ContributorObject(), new RegexCounter());
    $this->setupPatterns();
  }

  /**
   * Sets the words that identify organizations.
   *
   * @param string[] $orgWords
   *   Words to look for; they are stored lower-cased.
   */
  public function setOrgWords($orgWords) {
    $this->orgWords = array_map('strtolower', (array) $orgWords);
  }

  /**
   * Analyzes one contributor string.
   *
   * @param string $string
   *   Raw contributor string.
   *
   * @return ContributorObject|false|mixed
   *   - A ContributorObject with only "organization" set when the string
   *     contains an organization word, or is a short (< 6 bytes) purely
   *     alphanumeric token.
   *   - false for "[anon]" (case-insensitive) and for other short strings that
   *     contain none of the characters space, "+" or "?".
   *   - Otherwise whatever the normalizer's normalize() returned; when that is
   *     truthy it has lastNamePrefix set and has been passed through cleanUp().
   */
  public function analyze($string) {
    $string = trim((string) $string);

    // NOTE(review): the search strings below were reconstructed as HTML
    // entities. The previous literals were unreadable (the apostrophe line
    // did not even parse, the umlaut line replaced a character with itself).
    // Confirm against the upstream file.
    $string = str_replace(array('&#039;', '&#39;', '&apos;'), "'", $string);
    $string = str_replace('&Ouml;', 'Ö', $string);

    if (strcasecmp("[anon]", $string) === 0) {
      return false;
    }

    // strlen() counts bytes, so multi-byte names reach the limit sooner than
    // their visible length suggests.
    $isShort = strlen($string) < 6;

    if ($this->checkIsOrganization($string) || ($isShort && preg_match("/^[a-z0-9]+\$/i", $string))) {
      $co = new ContributorObject();
      $co->organization = $string;
      return $co;
    }

    if ($isShort && !preg_match("/[ +?]/", $string)) {
      return false;
    }

    list($lnPrefix, $string) = $this->checkAndSetLastNamePrefix($string);

    $object = $this->normalizer->normalize($string);
    if ($object) {
      $object->lastNamePrefix = $lnPrefix;
      $object = $this->cleanUp($object);
    }

    return $object;
  }

  /**
   * Tells whether the string contains one of the configured organization words.
   *
   * @param string $str
   *   String to inspect.
   *
   * @return bool
   *   true when any organization word occurs as a whole word.
   */
  protected function checkIsOrganization($str) {
    $str = strtolower($str);
    foreach ($this->orgWords as $word) {
      // An empty word would produce /\b\b/, which matches every non-empty
      // string and would classify everything as an organization.
      if ($word === '') {
        continue;
      }
      // NOTE(review): the word is interpolated into the pattern unescaped, so
      // a word containing "/" or regex metacharacters is interpreted as
      // pattern syntax. Left as is in case callers rely on that; consider
      // preg_quote($word, '/') after confirming.
      if (preg_match("/\\b{$word}\\b/i", $str)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Post-processes a matched contributor.
   *
   * Maps the suffix through ContributorObject::mapSuffix() and makes sure the
   * first and middle initials are filled in: a one-character name is moved to
   * the initial field, and a longer name contributes its first character.
   * An initial that is already set is never overwritten.
   *
   * @param ContributorObject $obj
   *   Object to clean up; it is modified in place.
   *
   * @return ContributorObject
   *   The same object.
   */
  protected function cleanUp(ContributorObject $obj) {
    if ($obj->suffix) {
      $obj->suffix = $obj->mapSuffix($obj->suffix);
    }

    foreach (array('middle', 'first') as $part) {
      $nameField = $part . 'Name';
      $initialField = $part . 'Initial';

      if ($obj->{$initialField}) {
        continue;
      }
      // Cast first: the field may be null, and strlen(null) style calls are
      // deprecated since PHP 8.1.
      $name = (string) $obj->{$nameField};
      if ($name === '') {
        continue;
      }

      $initial = $this->firstCharacter($name);
      $obj->{$initialField} = $initial;
      if ($initial === $name) {
        // The "name" was only an initial; keep it in the initial field alone.
        $obj->{$nameField} = null;
      }
    }

    return $obj;
  }

  /**
   * Returns the first character of a string, honouring UTF-8 sequences.
   *
   * Plain $value[0] returns a single byte, which is a broken character for
   * names that start with a multi-byte letter such as "É".
   *
   * @param string $value
   *   String to take the first character from.
   *
   * @return string
   *   First character, or '' for an empty string. For input that is not valid
   *   UTF-8 the first byte is returned.
   */
  private function firstCharacter($value) {
    $value = (string) $value;
    if ($value === '') {
      return '';
    }
    if (preg_match('/^./us', $value, $match)) {
      return $match[0];
    }
    return $value[0];
  }

  /**
   * Detects a last-name prefix ("van der", "de la", ...) and strips it.
   *
   * @param string $str
   *   Contributor string.
   *
   * @return array
   *   Two elements: the prefix as it appeared in the input (or null when none
   *   was found) and the string with that one occurrence removed, runs of
   *   spaces collapsed and surrounding whitespace trimmed. When no prefix is
   *   found the string is returned unchanged.
   */
  protected function checkAndSetLastNamePrefix($str) {
    // Longer alternatives come before their own prefixes ("de la" before
    // "de"), otherwise the shorter one always wins. The "u" modifier makes \b
    // and \p{L} work on characters instead of bytes, so a prefix is not
    // detected in the tail of a word such as "Mickaële".
    $regex = "/\\b(van der|van den|van de|von|van|de la|le|el|dos|de)\\s[\\p{L}]/iu";

    if (!preg_match($regex, $str, $matches, PREG_OFFSET_CAPTURE)) {
      return array(
        null,
        $str,
      );
    }

    $prefix = $matches[1][0];
    $offset = $matches[1][1];

    // Cut out exactly the matched occurrence. Replacing the first occurrence
    // of the prefix text instead would turn "Ivan van Gogh" into "I van Gogh".
    // Offsets from PREG_OFFSET_CAPTURE are byte offsets, matching substr().
    $str = substr($str, 0, $offset) . substr($str, $offset + strlen($prefix));

    // Removing the prefix leaves a doubled or leading space behind.
    $str = trim(preg_replace('/ {2,}/', ' ', $str));

    return array(
      $prefix,
      $str,
    );
  }

  /**
   * Registers every supported name layout on the normalizer.
   *
   * Each entry pairs a regular expression with the ContributorObject field
   * names its capture groups are assigned to, in group order. Entries are
   * appended in the order listed here.
   *
   * NOTE(review), behaviour deliberately left unchanged:
   * - "[ +?]" is a character class (space, "+" or "?"), not a quantified
   *   space.
   * - The suffix fragment starts with "(?i)", which presumably switches the
   *   rest of the enclosing pattern to case-insensitive matching.
   * - In "{$suffixPattern}?" the "?" attaches to the fragment's trailing
   *   "\.?", so it does not make the suffix itself optional.
   * - The third entry is the only one without a leading "^".
   */
  protected function setupPatterns() {
    // The hyphen must be the last character of the class: PCRE2 (PHP >= 7.3)
    // rejects a hyphen that directly follows an escape such as \p{L} in the
    // middle of a class as an invalid range. \p{Ll} was dropped because it is
    // a subset of \p{L}.
    $ppLastNamePattern = "([\\p{L}' -]+)";
    $lastNamePattern = "([\\p{L}'-]+)";
    $suffixPattern = "(?i)(Jr|Sr|Esq|Ph\\.?D|2nd|3rd|Psy\\.D|M\\.S|II|III|IV)\\.?";

    $patterns = array(
      // "Last, F. M. S., Suffix" - initials only, suffix after a comma.
      array(
        "/^{$ppLastNamePattern},[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?,[ +?]?{$suffixPattern}/u",
        array('lastName', 'firstInitial', 'middleInitial', 'secondMiddleInitial', 'suffix'),
      ),
      array(
        "/^{$ppLastNamePattern},[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?,[ +?]?{$suffixPattern}/u",
        array('lastName', 'firstInitial', 'middleInitial', 'suffix'),
      ),
      array(
        "/{$ppLastNamePattern},[ +?]?([A-Z])\\.?(?![a-z'-]),[ +?]?{$suffixPattern}/u",
        array('lastName', 'firstInitial', 'suffix'),
      ),

      // "Last Suffix, F. M. S." - suffix directly after the last name.
      array(
        "/^{$ppLastNamePattern} {$suffixPattern}?,[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?\$/u",
        array('lastName', 'suffix', 'firstInitial', 'middleInitial', 'secondMiddleInitial'),
      ),
      array(
        "/^{$ppLastNamePattern} {$suffixPattern}?,[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?(?![a-z'-])\\.?\$/u",
        array('lastName', 'suffix', 'firstInitial', 'middleInitial'),
      ),
      array(
        "/^{$ppLastNamePattern} {$suffixPattern}?,[ +?]?([A-Z])\\.?(?![a-z'-])\\.?\$/u",
        array('lastName', 'suffix', 'firstInitial'),
      ),

      // "Last, F. M. S." - initials only, no suffix.
      array(
        "/^{$ppLastNamePattern},[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?\$/u",
        array('lastName', 'firstInitial', 'middleInitial', 'secondMiddleInitial'),
      ),
      array(
        "/^{$ppLastNamePattern},[ +?]?([A-Z])\\.?[ +?]?([A-Z])\\.?(?![a-zA-Z'-])\\.?\$/u",
        array('lastName', 'firstInitial', 'middleInitial'),
      ),
      array(
        "/^{$ppLastNamePattern},[ +?]?([A-Z])\\.?(?![a-z'-])\\.?\$/u",
        array('lastName', 'firstInitial'),
      ),

      // "Last, First Middle S., Suffix" - full given names, with suffix.
      array(
        "/^{$ppLastNamePattern},[ +?]?([\\p{L}-]+)\\.?[ +?]([\\p{L}]+)\\.?[ +?]([A-Z])\\.?,[ +?]?{$suffixPattern}/ui",
        array('lastName', 'firstName', 'middleName', 'secondMiddleInitial', 'suffix'),
      ),
      array(
        "/^{$ppLastNamePattern},[ +?]?([\\p{L}-]+)\\.?[ +?]([\\p{L}]+)\\.?,[ +?]?{$suffixPattern}/ui",
        array('lastName', 'firstName', 'middleName', 'suffix'),
      ),
      array(
        "/^{$ppLastNamePattern},[ +?]?([\\p{L}-]+)\\.?,[ +?]?{$suffixPattern}/ui",
        array('lastName', 'firstName', 'suffix'),
      ),

      // "Last, First Middle S." - full given names, no suffix.
      array(
        "/^{$ppLastNamePattern},[ +?]?([\\p{L}-]+)\\.?[ +?]([\\p{L}]+)\\.?[ +?]([A-Z])\\.?/ui",
        array('lastName', 'firstName', 'middleName', 'secondMiddleInitial'),
      ),
      array(
        "/^{$ppLastNamePattern},[ +?]?([\\p{L}-]+)\\.?[ +?]([\\p{L}]+)\\.?/ui",
        array('lastName', 'firstName', 'middleName'),
      ),
      array(
        "/^{$ppLastNamePattern},[ +?]?([\\p{L}-]+)\\.?/ui",
        array('lastName', 'firstName'),
      ),

      // "F.M.S. Last" - leading initials.
      array(
        "/^([a-zA-Z])[\\. ]([a-zA-Z])[\\. ]([a-zA-Z])[\\.]? {$lastNamePattern}/u",
        array('firstInitial', 'middleInitial', 'secondMiddleInitial', 'lastName'),
      ),
      array(
        "/^([a-zA-Z])[\\. ]([a-zA-Z])[\\.]? {$lastNamePattern}/u",
        array('firstInitial', 'middleInitial', 'lastName'),
      ),

      // "First M. S. Last, Suffix" - natural order, with suffix.
      array(
        "/^([\\p{L}-]+)\\.? ([A-Z])[\\.| ]?([A-Z])\\.? {$lastNamePattern},[ +?]?{$suffixPattern}/u",
        array('firstName', 'middleName', 'secondMiddleInitial', 'lastName', 'suffix'),
      ),
      array(
        "/^([\\p{L}-]+)\\.?[ +?]([a-z][a-z]+)[ +?]([A-Z])\\.?[ +?]{$lastNamePattern},[ +?]?{$suffixPattern}/ui",
        array('firstName', 'middleName', 'secondMiddleInitial', 'lastName', 'suffix'),
      ),
      array(
        "/^([\\p{L}-]+)\\.?[ +?]([a-z][a-z]+)[ +?]{$lastNamePattern},[ +?]?{$suffixPattern}/ui",
        array('firstName', 'middleName', 'lastName', 'suffix'),
      ),
      array(
        "/^([\\p{L}-]+)\\.?[ +?]{$lastNamePattern},[ +?]?{$suffixPattern}/ui",
        array('firstName', 'lastName', 'suffix'),
      ),

      // "First M. S. Last" - natural order, no suffix.
      array(
        "/^([\\p{L}-]+)\\.?[ +?]([A-Z])[\\.| ]?([A-Z])\\.?[ +?]{$lastNamePattern}/u",
        array('firstName', 'middleName', 'secondMiddleInitial', 'lastName'),
      ),
      array(
        "/^([\\p{L}-]+)\\.?[ +?]([a-z][a-z]+)[ +?]([A-Z])\\.?[ +?]{$lastNamePattern}/ui",
        array('firstName', 'middleName', 'secondMiddleInitial', 'lastName'),
      ),
      array(
        "/^([\\p{L}-]+)\\.?[ +?]([a-z][a-z]+)[ +?]{$lastNamePattern}/ui",
        array('firstName', 'middleName', 'lastName'),
      ),
      array(
        "/^([\\p{L}-]+)\\.?[ +?]{$lastNamePattern}/ui",
        array('firstName', 'lastName'),
      ),

      // "Suffix, First M Last" - leading suffix.
      array(
        "/^{$suffixPattern},\\s?([\\p{L}-]+)\\.?[ +?]([a-z]+)\\.?[ +?]{$lastNamePattern}/ui",
        array('suffix', 'firstName', 'middleInitial', 'lastName'),
      ),
    );

    foreach ($patterns as $entry) {
      $this->normalizer->appendPattern($entry[0], $entry[1]);
    }
  }

}