Created
September 16, 2015 05:43
-
-
Save harisenbon/563ef073455cf3f58c9e to your computer and use it in GitHub Desktop.
Detect names as Japanese or Western
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Filter for determining whether the names in a CSV look like Japanese names written in romaji.
 * Usage: php name-filter.php ~/Desktop/names.csv
 *
 * Right now it prints each name followed by a 1 if it looks Japanese, for easy verification, but it's easy to switch that to the actual line of data.
 * Right now it requires that both the first and last name look Japanese, so it fails on people like Yumiko who are Japanese with Western last names.
 *
 */
// The CSV path must be supplied as the first command-line argument.
// Failures are reported on STDERR with a non-zero exit status so that shell
// pipelines and callers can detect them (die('...') would exit with status 0).
if (count($argv) <= 1) {
    fwrite(STDERR, "You must specify a file.\n");
    exit(1);
}

$file = $argv[1];
if (!file_exists($file)) {
    fwrite(STDERR, "The file '$file' does not exist.\n");
    exit(1);
}

// file_exists() is also true for directories and unreadable files, so the
// result of fopen() still has to be checked before the handle is used.
$fh = fopen($file, 'rb');
if ($fh === false) {
    fwrite(STDERR, "The file '$file' could not be opened.\n");
    exit(1);
}

// Maps column name => column index; filled in from the first CSV row.
$header = null;
// Romaji mora accepted as building blocks of a name. Both Hepburn-style and
// Kunrei-style spellings are listed where they differ (shi/si, chi/ti, fu/hu, ...).
$validRomaji = [
    'a', 'i', 'u', 'e', 'o',
    'ka', 'ki', 'ku', 'ke', 'ko',
    'kya', 'kyu', 'kyo',
    'ga', 'gi', 'gu', 'ge', 'go',
    'sa', 'shi', 'si', 'su', 'se', 'so',
    'sha', 'sya', 'shu', 'syu', 'sho', 'syo',
    'ja', 'ju', 'jo',
    // The z-row previously repeated 'se' where 'ze' belongs.
    'za', 'ji', 'zi', 'zu', 'ze', 'zo',
    'ta', 'chi', 'ti', 'tsu', 'te', 'to',
    'da', 'di', 'dhi', 'du', 'dzu', 'de', 'do',
    'cha', 'chu', 'cho', 'tcha',
    'na', 'ni', 'nu', 'ne', 'no',
    'nya', 'nyu', 'nyo',
    'ha', 'hi', 'fu', 'hu', 'he', 'ho',
    // The hy-row previously listed 'hyo' twice and was missing 'hya'.
    'hya', 'hyu', 'hyo',
    'pya', 'pyu', 'pyo',
    'bya', 'byu', 'byo',
    'ba', 'bi', 'bu', 'be', 'bo',
    'pa', 'pi', 'pu', 'pe', 'po',
    'ma', 'mi', 'mu', 'me', 'mo',
    'mya', 'myu', 'myo',
    'ra', 'ri', 'ru', 're', 'ro',
    'rya', 'ryo', 'ryu',
    'ya', 'yu', 'yo',
    'wa', 'wo',
    'n',
];
// Walk the CSV row by row. The first non-blank row is taken as the header;
// every later row is printed as "Checking <given> <family>..." followed by
// "1" when both names look Japanese (nothing otherwise).
// A length of 0 lifts the per-line size limit so long rows are not split.
while (($row = fgetcsv($fh, 0, ',')) !== false) {
    // fgetcsv() reports a blank line as an array holding a single null.
    if ($row === [null]) {
        continue;
    }

    if (!$header) {
        // Turn [index => column name] into [column name => index].
        $header = array_flip($row);
        print_r($header);

        // Without these two columns no row could be looked up below.
        if (!isset($header['Given name'], $header['Family name'])) {
            fclose($fh);
            fwrite(STDERR, "The file must have 'Given name' and 'Family name' columns.\n");
            exit(1);
        }
        continue;
    }

    // Short rows may lack a column; treat the missing value as an empty name.
    $givenIndex = $header['Given name'];
    $familyIndex = $header['Family name'];
    $givenName = isset($row[$givenIndex]) ? $row[$givenIndex] : '';
    $familyName = isset($row[$familyIndex]) ? $row[$familyIndex] : '';

    $looksJapanese = isPlausibleJapaneseName($givenName, $validRomaji)
        && isPlausibleJapaneseName($familyName, $validRomaji);

    echo "Checking $givenName $familyName...";
    // Same output as echoing the boolean directly: "1" for true, "" for false.
    echo $looksJapanese ? '1' : '';
    echo "\n";
}
fclose($fh);
/**
 * Check whether a romanised name can be consumed entirely as a sequence of romaji mora.
 *
 * Everything except ASCII letters is stripped and the rest is lower-cased, then the
 * name is scanned left to right. At each position one of the following must hold:
 *  - the single letter is listed in $validRomaji on its own;
 *  - the letter is doubled (sokuon, e.g. the "tt" in "Hattori") and a mora of
 *    2 to 4 letters from $validRomaji starts at the second copy;
 *  - a mora of 2 to 4 letters from $validRomaji starts at this letter.
 * The sokuon handling is simplified and misses more complex usages, which should
 * be rare in names.
 *
 * @param string   $name        Name to test; characters other than a-z / A-Z are ignored.
 * @param string[] $validRomaji Lower-case romaji mora accepted as building blocks.
 *
 * @return bool True when the whole name splits into valid mora; false otherwise,
 *              including when the name contains no ASCII letters at all.
 */
function isPlausibleJapaneseName($name, $validRomaji) {
    $letters = strtolower(preg_replace('/[^a-zA-Z]+/', '', (string) $name));
    $length = strlen($letters);

    // Nothing to judge. Returning false explicitly keeps the result the same on
    // every PHP version (str_split('') changed from [''] to [] in PHP 8.2).
    if ($length === 0) {
        return false;
    }

    $position = 0;
    while ($position < $length) {
        $letter = $letters[$position];

        // Letters that are a mora on their own.
        if (in_array($letter, $validRomaji, true)) {
            $position++;
            continue;
        }

        // Sokuon: a doubled consonant must be followed by a mora that begins
        // with the second copy of that consonant.
        if ($position + 1 < $length && $letter === $letters[$position + 1]) {
            $moraStart = $position + 1;
            for ($moraLength = 2; $moraLength <= 4; $moraLength++) {
                if ($moraStart + $moraLength > $length) {
                    return false;
                }
                if (in_array(substr($letters, $moraStart, $moraLength), $validRomaji, true)) {
                    // Resume right after the mora. The index is set, not added to,
                    // so no letters behind the mora are skipped unchecked.
                    $position = $moraStart + $moraLength;
                    continue 2;
                }
            }
        }

        // Longer mora such as "ka", "chi", "shi" or "tcha": extend the candidate
        // one letter at a time until it is recognised or the name runs out.
        for ($moraLength = 2; $moraLength <= 4; $moraLength++) {
            if ($position + $moraLength > $length) {
                return false;
            }
            if (in_array(substr($letters, $position, $moraLength), $validRomaji, true)) {
                $position += $moraLength;
                continue 2;
            }
        }

        return false;
    }

    return true;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment