harisenbon · September 16, 2015 05:43
diff --git a/name-filter.php b/name-filter.php
 <?php
 /**
 * Filter for determining whether each name in a CSV file looks like a Japanese name written in romaji
 * Usage: `php name-filter.php ~/Desktop/names.csv`
 * 
 * right now it returns the name and a 1 if it looks Japanese for easy verification, but it’s easy to switch that to the actual line of data
 * right now it requires that both the first and last name look Japanese, so it fails on people like Yumiko who are Japanese with western last names
 * 
 */

 // --- Command-line handling -------------------------------------------------
 // Problems are reported on STDERR with a non-zero exit status so that shell
 // pipelines can tell a failed run apart from an empty result.
 if (count($argv) <= 1) {
 	fwrite(STDERR, 'You must specify a file.' . PHP_EOL);
 	exit(1);
 }
 $file = $argv[1];

 if (!file_exists($file)) {
 	fwrite(STDERR, "The file '$file' does not exist." . PHP_EOL);
 	exit(1);
 }

 // file_exists() is also true for directories and unreadable files, so the
 // result of fopen() still has to be checked.
 $fh = fopen($file, 'rb');
 if ($fh === false) {
 	fwrite(STDERR, "The file '$file' could not be opened." . PHP_EOL);
 	exit(1);
 }

 // Column name => column index, filled in from the first non-blank CSV row.
 $header = null;

 // Every romaji mora accepted by isPlausibleJapaneseName(). Both Hepburn and
 // Kunrei-style spellings are listed (shi/si, chi/ti, fu/hu, ...).
 $validRomaji = [
 	'a', 'i', 'u', 'e', 'o',
 	'ka', 'ki', 'ku', 'ke', 'ko',
 	'kya', 'kyu', 'kyo',
 	'ga', 'gi', 'gu', 'ge', 'go',
 	'gya', 'gyu', 'gyo',
 	'sa', 'shi', 'si', 'su', 'se', 'so',
 	'sha', 'sya', 'shu', 'syu', 'sho', 'syo',
 	'ja', 'ju', 'jo',
 	'za', 'ji', 'zi', 'zu', 'ze', 'zo',
 	'ta', 'chi', 'ti', 'tsu', 'te', 'to',
 	'da', 'di', 'dhi', 'du', 'dzu', 'de', 'do',
 	'cha', 'chu', 'cho', 'tcha',
 	'na', 'ni', 'nu', 'ne', 'no',
 	'nya', 'nyu', 'nyo',
 	'ha', 'hi', 'fu', 'hu', 'he', 'ho',
 	'hya', 'hyu', 'hyo',
 	'pya', 'pyu', 'pyo',
 	'bya', 'byu', 'byo',
 	'ba', 'bi', 'bu', 'be', 'bo',
 	'pa', 'pi', 'pu', 'pe', 'po',
 	'ma', 'mi', 'mu', 'me', 'mo',
 	'mya', 'myu', 'myo',
 	'ra', 'ri', 'ru', 're', 'ro',
 	'rya', 'ryo', 'ryu',
 	'ya', 'yu', 'yo',
 	'wa', 'wo',
 	'n',
 ];

 // A length of 0 removes the per-line limit, so long rows are not split.
 while (($row = fgetcsv($fh, 0, ',')) !== false) {
 	// fgetcsv() reports a blank line as an array holding a single null.
 	if ($row === [null]) {
 		continue;
 	}

 	if ($header === null) {
 		$header = array_flip($row);
 		print_r($header);

 		// Fail once, clearly, instead of emitting a notice for every row.
 		foreach (['Given name', 'Family name'] as $column) {
 			if (!isset($header[$column])) {
 				fwrite(STDERR, "The file '$file' has no '$column' column." . PHP_EOL);
 				fclose($fh);
 				exit(1);
 			}
 		}
 		continue;
 	}

 	// Rows shorter than the header are treated as having empty names.
 	$givenName = isset($row[$header['Given name']]) ? $row[$header['Given name']] : '';
 	$familyName = isset($row[$header['Family name']]) ? $row[$header['Family name']] : '';

 	// Both parts of the name have to look Japanese for the row to be flagged.
 	$looksJapanese = isPlausibleJapaneseName($givenName, $validRomaji)
 		&& isPlausibleJapaneseName($familyName, $validRomaji);

 	// Output format: "Checking <given> <family>..." followed by "1" on a match
 	// and by nothing otherwise.
 	echo "Checking $givenName $familyName...";
 	echo $looksJapanese ? '1' : '';
 	echo "\n";
 }

 fclose($fh);

 /**
  * Decide whether a name can be spelled entirely with romaji mora.
  *
  * Everything except ASCII letters is dropped and the rest is lower-cased.
  * The letters are then consumed left to right: each step has to match an
  * entry of $validRomaji of one to four letters, optionally preceded by a
  * doubled consonant (sokuon, e.g. the "tt" in "Hattori").
  *
  * @param string   $name        Name to test; may contain spaces or punctuation.
  * @param string[] $validRomaji Accepted lower-case mora.
  *
  * @return bool True when the whole name is covered by valid mora; false
  *              otherwise, including when the name contains no letters.
  */
 function isPlausibleJapaneseName($name, $validRomaji) {
 	$letters = strtolower(preg_replace('/[^a-zA-Z]+/', '', (string) $name));

 	// str_split('') yields [''] before PHP 8.2 and [] from 8.2 on, so an empty
 	// name is rejected explicitly to keep the result version-independent.
 	if ($letters === '') {
 		return false;
 	}

 	$nameParts = str_split($letters);
 	$nameLength = count($nameParts);

 	for ($i = 0; $i < $nameLength; $i++) {
 		$part = $nameParts[$i];

 		// Single-letter mora: the vowels and "n".
 		if (in_array($part, $validRomaji, true)) {
 			continue;
 		}

 		// check for sokuon. this is simplified and misses more complex usages,
 		// but those should be rare in names.
 		if (isset($nameParts[$i + 1]) && $part === $nameParts[$i + 1]) {
 			$sokuon = $nameParts[$i + 1];

 			for ($j = 1; $j <= 3; $j++) {
 				$sokuonOffset = $i + 1 + $j;
 				if (!isset($nameParts[$sokuonOffset])) {
 					return false;
 				}
 				$sokuon .= $nameParts[$sokuonOffset];
 				if (in_array($sokuon, $validRomaji, true)) {
 					// Move to the last letter of the matched mora. Adding the
 					// offset to $i instead would jump past letters that were
 					// never checked.
 					$i = $sokuonOffset;
 					continue 2;
 				}
 			}
 		}

 		// try the next three characters (to allow for chi, shi, tcha, and other longer mora).
 		// We need to start at 1 so that we're looking at the next letter.
 		for ($j = 1; $j <= 3; $j++) {
 			if (!isset($nameParts[$i + $j])) {
 				return false;
 			}
 			$part .= $nameParts[$i + $j];
 			if (in_array($part, $validRomaji, true)) {
 				// Consume the whole mora so its letters are not re-examined.
 				$i += $j;
 				continue 2;
 			}
 		}

 		return false;
 	}

 	return true;
 }
	<?php
	/**
	* Filter for determining whether each name in a CSV file looks like a Japanese name written in romaji
	* Usage: `php name-filter.php ~/Desktop/names.csv`
	*
	* right now it returns the name and a 1 if it looks Japanese for easy verification, but it’s easy to switch that to the actual line of data
	* right now it requires that both the first and last name look Japanese, so it fails on people like Yumiko who are Japanese with western last names
	*
	*/

	// --- Command-line handling -------------------------------------------------
	// Problems are reported on STDERR with a non-zero exit status so that shell
	// pipelines can tell a failed run apart from an empty result.
	if (count($argv) <= 1) {
		fwrite(STDERR, 'You must specify a file.' . PHP_EOL);
		exit(1);
	}
	$file = $argv[1];

	if (!file_exists($file)) {
		fwrite(STDERR, "The file '$file' does not exist." . PHP_EOL);
		exit(1);
	}

	// file_exists() is also true for directories and unreadable files, so the
	// result of fopen() still has to be checked.
	$fh = fopen($file, 'rb');
	if ($fh === false) {
		fwrite(STDERR, "The file '$file' could not be opened." . PHP_EOL);
		exit(1);
	}

	// Column name => column index, filled in from the first non-blank CSV row.
	$header = null;

	// Every romaji mora accepted by isPlausibleJapaneseName(). Both Hepburn and
	// Kunrei-style spellings are listed (shi/si, chi/ti, fu/hu, ...).
	$validRomaji = [
		'a', 'i', 'u', 'e', 'o',
		'ka', 'ki', 'ku', 'ke', 'ko',
		'kya', 'kyu', 'kyo',
		'ga', 'gi', 'gu', 'ge', 'go',
		'gya', 'gyu', 'gyo',
		'sa', 'shi', 'si', 'su', 'se', 'so',
		'sha', 'sya', 'shu', 'syu', 'sho', 'syo',
		'ja', 'ju', 'jo',
		'za', 'ji', 'zi', 'zu', 'ze', 'zo',
		'ta', 'chi', 'ti', 'tsu', 'te', 'to',
		'da', 'di', 'dhi', 'du', 'dzu', 'de', 'do',
		'cha', 'chu', 'cho', 'tcha',
		'na', 'ni', 'nu', 'ne', 'no',
		'nya', 'nyu', 'nyo',
		'ha', 'hi', 'fu', 'hu', 'he', 'ho',
		'hya', 'hyu', 'hyo',
		'pya', 'pyu', 'pyo',
		'bya', 'byu', 'byo',
		'ba', 'bi', 'bu', 'be', 'bo',
		'pa', 'pi', 'pu', 'pe', 'po',
		'ma', 'mi', 'mu', 'me', 'mo',
		'mya', 'myu', 'myo',
		'ra', 'ri', 'ru', 're', 'ro',
		'rya', 'ryo', 'ryu',
		'ya', 'yu', 'yo',
		'wa', 'wo',
		'n',
	];

	// A length of 0 removes the per-line limit, so long rows are not split.
	while (($row = fgetcsv($fh, 0, ',')) !== false) {
		// fgetcsv() reports a blank line as an array holding a single null.
		if ($row === [null]) {
			continue;
		}

		if ($header === null) {
			$header = array_flip($row);
			print_r($header);

			// Fail once, clearly, instead of emitting a notice for every row.
			foreach (['Given name', 'Family name'] as $column) {
				if (!isset($header[$column])) {
					fwrite(STDERR, "The file '$file' has no '$column' column." . PHP_EOL);
					fclose($fh);
					exit(1);
				}
			}
			continue;
		}

		// Rows shorter than the header are treated as having empty names.
		$givenName = isset($row[$header['Given name']]) ? $row[$header['Given name']] : '';
		$familyName = isset($row[$header['Family name']]) ? $row[$header['Family name']] : '';

		// Both parts of the name have to look Japanese for the row to be flagged.
		$looksJapanese = isPlausibleJapaneseName($givenName, $validRomaji)
			&& isPlausibleJapaneseName($familyName, $validRomaji);

		// Output format: "Checking <given> <family>..." followed by "1" on a match
		// and by nothing otherwise.
		echo "Checking $givenName $familyName...";
		echo $looksJapanese ? '1' : '';
		echo "\n";
	}

	fclose($fh);

	/**
	 * Decide whether a name can be spelled entirely with romaji mora.
	 *
	 * Everything except ASCII letters is dropped and the rest is lower-cased.
	 * The letters are then consumed left to right: each step has to match an
	 * entry of $validRomaji of one to four letters, optionally preceded by a
	 * doubled consonant (sokuon, e.g. the "tt" in "Hattori").
	 *
	 * @param string   $name        Name to test; may contain spaces or punctuation.
	 * @param string[] $validRomaji Accepted lower-case mora.
	 *
	 * @return bool True when the whole name is covered by valid mora; false
	 *              otherwise, including when the name contains no letters.
	 */
	function isPlausibleJapaneseName($name, $validRomaji) {
		$letters = strtolower(preg_replace('/[^a-zA-Z]+/', '', (string) $name));

		// str_split('') yields [''] before PHP 8.2 and [] from 8.2 on, so an empty
		// name is rejected explicitly to keep the result version-independent.
		if ($letters === '') {
			return false;
		}

		$nameParts = str_split($letters);
		$nameLength = count($nameParts);

		for ($i = 0; $i < $nameLength; $i++) {
			$part = $nameParts[$i];

			// Single-letter mora: the vowels and "n".
			if (in_array($part, $validRomaji, true)) {
				continue;
			}

			// check for sokuon. this is simplified and misses more complex usages,
			// but those should be rare in names.
			if (isset($nameParts[$i + 1]) && $part === $nameParts[$i + 1]) {
				$sokuon = $nameParts[$i + 1];

				for ($j = 1; $j <= 3; $j++) {
					$sokuonOffset = $i + 1 + $j;
					if (!isset($nameParts[$sokuonOffset])) {
						return false;
					}
					$sokuon .= $nameParts[$sokuonOffset];
					if (in_array($sokuon, $validRomaji, true)) {
						// Move to the last letter of the matched mora. Adding the
						// offset to $i instead would jump past letters that were
						// never checked.
						$i = $sokuonOffset;
						continue 2;
					}
				}
			}

			// try the next three characters (to allow for chi, shi, tcha, and other longer mora).
			// We need to start at 1 so that we're looking at the next letter.
			for ($j = 1; $j <= 3; $j++) {
				if (!isset($nameParts[$i + $j])) {
					return false;
				}
				$part .= $nameParts[$i + $j];
				if (in_array($part, $validRomaji, true)) {
					// Consume the whole mora so its letters are not re-examined.
					$i += $j;
					continue 2;
				}
			}

			return false;
		}

		return true;
	}
No results found