Skip to content

Instantly share code, notes, and snippets.

@harisenbon
Created September 16, 2015 05:43
Show Gist options
  • Select an option

  • Save harisenbon/563ef073455cf3f58c9e to your computer and use it in GitHub Desktop.

Select an option

Save harisenbon/563ef073455cf3f58c9e to your computer and use it in GitHub Desktop.
Detect names as Japanese or Western
<?php
/**
* Filter for determining whether the names in a CSV file look like Japanese names written in romaji.
* Usage: php name-filter.php ~/Desktop/names.csv
*
* Right now it prints each name followed by a 1 if it looks Japanese, for easy verification, but it's easy to switch that to the actual line of data.
* Right now it requires that both the first and last name look Japanese, so it fails on people like Yumiko who are Japanese with Western last names.
*
*/
// ---------------------------------------------------------------------------
// Command-line entry point.
//
// Reads the CSV file named by the first argument. The first non-blank row is
// treated as the header and must contain the columns "Given name" and
// "Family name". For every following row the script prints
// "Checking <given> <family>..." followed by "1" when BOTH names pass
// isPlausibleJapaneseName(), or by nothing when either one fails.
// ---------------------------------------------------------------------------
if (count($argv) <= 1) {
    die('You must specify a file.');
}

$file = $argv[1];
if (!file_exists($file)) {
    die("The file '$file' does not exist.");
}

$fh = fopen($file, 'rb');
if ($fh === false) {
    // The file exists but cannot be read (permissions, directory, ...).
    die("The file '$file' could not be opened.");
}

// Maps column name => column index once the header row has been read.
$header = null;

// Morae accepted by isPlausibleJapaneseName(). Several romanisation spellings
// are listed side by side (e.g. "shi"/"si", "sha"/"sya", "fu"/"hu").
$validRomaji = [
    'a', 'i', 'u', 'e', 'o',
    'ka', 'ki', 'ku', 'ke', 'ko',
    'kya', 'kyu', 'kyo',
    'ga', 'gi', 'gu', 'ge', 'go',
    'gya', 'gyu', 'gyo',
    'sa', 'shi', 'si', 'su', 'se', 'so',
    'sha', 'sya', 'shu', 'syu', 'sho', 'syo',
    'ja', 'ju', 'jo',
    'za', 'ji', 'zi', 'zu', 'ze', 'zo',
    'ta', 'chi', 'ti', 'tsu', 'te', 'to',
    'da', 'di', 'dhi', 'du', 'dzu', 'de', 'do',
    'cha', 'chu', 'cho', 'tcha',
    'na', 'ni', 'nu', 'ne', 'no',
    'nya', 'nyu', 'nyo',
    'ha', 'hi', 'fu', 'hu', 'he', 'ho',
    'hya', 'hyu', 'hyo',
    'pya', 'pyu', 'pyo',
    'bya', 'byu', 'byo',
    'ba', 'bi', 'bu', 'be', 'bo',
    'pa', 'pi', 'pu', 'pe', 'po',
    'ma', 'mi', 'mu', 'me', 'mo',
    'mya', 'myu', 'myo',
    'ra', 'ri', 'ru', 're', 'ro',
    'rya', 'ryo', 'ryu',
    'ya', 'yu', 'yo',
    'wa', 'wo',
    'n',
];

while (($row = fgetcsv($fh, 1000, ',')) !== false) {
    // fgetcsv() reports a blank line as an array holding a single null.
    if ($row === [null]) {
        continue;
    }

    if (!$header) {
        // Drop a UTF-8 byte-order mark so the first column name still matches.
        if (strncmp($row[0], "\xEF\xBB\xBF", 3) === 0) {
            $row[0] = (string) substr($row[0], 3);
        }
        $header = array_flip($row);
        print_r($header);
        if (!isset($header['Given name']) || !isset($header['Family name'])) {
            fclose($fh);
            die("The file '$file' must have 'Given name' and 'Family name' columns.");
        }
        continue;
    }

    // Rows shorter than the header are treated as having empty names.
    $givenIndex = $header['Given name'];
    $familyIndex = $header['Family name'];
    $givenName = isset($row[$givenIndex]) ? $row[$givenIndex] : '';
    $familyName = isset($row[$familyIndex]) ? $row[$familyIndex] : '';

    echo "Checking $givenName $familyName...";
    // Echoing a boolean prints "1" for true and an empty string for false.
    echo isPlausibleJapaneseName($givenName, $validRomaji) && isPlausibleJapaneseName($familyName, $validRomaji);
    echo "\n";
}

fclose($fh);
/**
 * Decide whether a name can be read as a run of romaji morae.
 *
 * Everything except ASCII letters is stripped and the rest is lower-cased.
 * The letters are then scanned left to right. A position is accepted when
 *  - the single letter is itself an accepted mora (vowels, "n"), or
 *  - the letter is doubled (sokuon) and the 2-4 letters starting at the second
 *    copy form an accepted mora; scanning resumes after that mora, or
 *  - the 2-4 letters starting at this position form an accepted mora. In this
 *    case scanning resumes at the very next letter, so the remaining letters
 *    of the mora are checked again on their own.
 *
 * @param string   $name        Name to test; case and non-letter characters are ignored.
 * @param string[] $validRomaji Accepted morae, in lower case.
 *
 * @return bool True when every scanned position is accepted; false otherwise,
 *              including when the name contains no ASCII letters at all.
 */
function isPlausibleJapaneseName($name, $validRomaji) {
    $letters = strtolower(preg_replace('/[^a-zA-Z]+/', '', (string) $name));

    // str_split('') gives [''] before PHP 8.2 and [] from 8.2 on, which would
    // make an empty name pass on newer versions. Reject it explicitly.
    if ($letters === '') {
        return false;
    }

    $nameParts = str_split($letters);
    $nameLength = count($nameParts);

    for ($i = 0; $i < $nameLength; $i++) {
        $part = $nameParts[$i];
        if (in_array($part, $validRomaji, true)) {
            continue;
        }

        // Check for sokuon (doubled consonant). This is simplified and misses
        // more complex usages, but those should be rare in names.
        if (isset($nameParts[$i + 1]) && $part === $nameParts[$i + 1]) {
            $sokuon = $nameParts[$i + 1];
            for ($j = 1; $j <= 3; $j++) {
                $sokuonOffset = $i + 1 + $j;
                if (!isset($nameParts[$sokuonOffset])) {
                    return false;
                }
                $sokuon .= $nameParts[$sokuonOffset];
                if (in_array($sokuon, $validRomaji, true)) {
                    // Resume right after the mora. $sokuonOffset is already an
                    // absolute index, so assign it instead of adding it to $i;
                    // adding would skip letters that were never checked.
                    $i = $sokuonOffset;
                    continue 2;
                }
            }
        }

        // Try the next three characters (to allow for chi, shi, tcha, and
        // other longer morae). Start at 1 so we look at the following letter.
        for ($j = 1; $j <= 3; $j++) {
            if (!isset($nameParts[$i + $j])) {
                return false;
            }
            $part .= $nameParts[$i + $j];
            if (in_array($part, $validRomaji, true)) {
                continue 2;
            }
        }

        return false;
    }

    return true;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment