Created
September 23, 2017 01:18
-
-
Save twairball/2bd45f0f1dbfe3543a34193edfde2f1b to your computer and use it in GitHub Desktop.
WMT17 Zh-En corpora have a different number of lines?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import jieba\n", | |
| "import nltk\n", | |
| "import os\n", | |
| "\n", | |
| "\"\"\"\n", | |
| "Notebook exploring the weird misalignment between the Zh and En corpora.\n", | |
| "Inspecting the files on a Unix console, we expect 227,330 lines in both corpora.\n", | |
| "However, looping through the files line by line in Python, we find:\n", | |
| " EN: 227568\n", | |
| " ZH: 227603 (diff: 35)\n", | |
| "\n", | |
| "\n", | |
| "On console: \n", | |
| "$ wc -l training/news-commentary-v12.zh-en.zh\n", | |
| "227330 training/news-commentary-v12.zh-en.zh\n", | |
| "\n", | |
| "$ wc -l training/news-commentary-v12.zh-en.en\n", | |
| "227330 training/news-commentary-v12.zh-en.en\n", | |
| "\n", | |
| "\n", | |
| "WMT17 training Dataset corpus can be downloaded from:\n", | |
| "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\"\"\"\n", | |
| "zh_filepath=\"tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.zh\"\n", | |
| "en_filepath=\"tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.en\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 52, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(227603, 227573)" | |
| ] | |
| }, | |
| "execution_count": 52, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
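| "\"\"\" Counting using splitlines. We get a different (higher) count for En, presumably because str.splitlines() also splits on extra boundaries (e.g. form feed, Unicode line separator) that readlines() does not treat as line ends. \"\"\"\n", | |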
| "def count_splitlines(filename):\n", | |
| " return len(open(filename).read().splitlines())\n", | |
| "\n", | |
| "count_splitlines(zh_filepath), count_splitlines(en_filepath)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(227603, 227568)" | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "\"\"\" readlines() matches what we typically expect when reading line by line in Python. \"\"\"\n", | |
| "def count_readlines(filename):\n", | |
| " return len(open(filename).readlines())\n", | |
| "\n", | |
| "count_readlines(zh_filepath), count_readlines(en_filepath)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "\"\"\" Count lines that are blank. \"\"\"\n", | |
| "def blank_line_count(filename):\n", | |
| " with open(filename) as fd:\n", | |
| " count = sum(1 for line in fd if len(line.strip()) == 0)\n", | |
| " return count" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(39, 146)" | |
| ] | |
| }, | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "blank_line_count(zh_filepath), blank_line_count(en_filepath)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 53, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(0, 0)" | |
| ] | |
| }, | |
| "execution_count": 53, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
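| "\"\"\" Find occurrences of weird line breaks? \\n, \\r, \\t, \\v\n", | |
| "NOTE: open() in text mode translates carriage returns to newlines on read, so counting them this way reports 0; open the file with newline='' to see them. \"\"\"\n", | |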
| "import re\n", | |
| "def cr_count(filename, substr=\"\\r\"):\n", | |
| " full = open(filename).read()\n", | |
| " return len(re.findall(substr, full))\n", | |
| "\n", | |
| "cr_count(zh_filepath, \"\\r\"), cr_count(en_filepath, \"\\r\") " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def report_blank_lines(filename):\n", | |
| " tot = 0\n", | |
| " with open(filename) as f:\n", | |
| " for i, l in enumerate(f):\n", | |
| " if len(l.strip()) < 1:\n", | |
| " print(\"[%d] %s[END]\" % (i, l))\n", | |
| " tot = tot + 1\n", | |
| " print(\" total: %d\" % tot)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[27660] \n", | |
| "[END]\n", | |
| "[51225] \n", | |
| "[END]\n", | |
| "[66871] \n", | |
| "[END]\n", | |
| "[75770] \n", | |
| "[END]\n", | |
| "[75775] \n", | |
| "[END]\n", | |
| "[82330] \n", | |
| "[END]\n", | |
| "[89880] \n", | |
| "[END]\n", | |
| "[91075] \n", | |
| "[END]\n", | |
| "[105145] \n", | |
| "[END]\n", | |
| "[119307] \n", | |
| "[END]\n", | |
| "[126515] \n", | |
| "[END]\n", | |
| "[128127] \n", | |
| "[END]\n", | |
| "[137127] \n", | |
| "[END]\n", | |
| "[137604] \n", | |
| "[END]\n", | |
| "[145516] \n", | |
| "[END]\n", | |
| "[146597] \n", | |
| "[END]\n", | |
| "[147274] \n", | |
| "[END]\n", | |
| "[151833] \n", | |
| "[END]\n", | |
| "[166718] \n", | |
| "[END]\n", | |
| "[167566] \n", | |
| "[END]\n", | |
| "[167574] \n", | |
| "[END]\n", | |
| "[167586] \n", | |
| "[END]\n", | |
| "[167591] \n", | |
| "[END]\n", | |
| "[167598] \n", | |
| "[END]\n", | |
| "[172120] \n", | |
| "[END]\n", | |
| "[176885] \n", | |
| "[END]\n", | |
| "[178064] \n", | |
| "[END]\n", | |
| "[178066] \n", | |
| "[END]\n", | |
| "[178643] \n", | |
| "[END]\n", | |
| "[178983] \n", | |
| "[END]\n", | |
| "[178985] \n", | |
| "[END]\n", | |
| "[179010] \n", | |
| "[END]\n", | |
| "[179817] \n", | |
| "[END]\n", | |
| "[180836] \n", | |
| "[END]\n", | |
| "[183407] \n", | |
| "[END]\n", | |
| "[190958] \n", | |
| "[END]\n", | |
| "[193349] \n", | |
| "[END]\n", | |
| "[197577] \n", | |
| "[END]\n", | |
| "[206624] \n", | |
| "[END]\n", | |
| " total: 39\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "report_blank_lines(zh_filepath)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[4088] \n", | |
| "[END]\n", | |
| "[8112] \n", | |
| "[END]\n", | |
| "[13275] \n", | |
| "[END]\n", | |
| "[13276] \n", | |
| "[END]\n", | |
| "[13357] \n", | |
| "[END]\n", | |
| "[13358] \n", | |
| "[END]\n", | |
| "[13581] \n", | |
| "[END]\n", | |
| "[13582] \n", | |
| "[END]\n", | |
| "[13783] \n", | |
| "[END]\n", | |
| "[13784] \n", | |
| "[END]\n", | |
| "[14646] \n", | |
| "[END]\n", | |
| "[14740] \n", | |
| "[END]\n", | |
| "[15454] \n", | |
| "[END]\n", | |
| "[15514] \n", | |
| "[END]\n", | |
| "[15515] \n", | |
| "[END]\n", | |
| "[16244] \n", | |
| "[END]\n", | |
| "[20289] \n", | |
| "[END]\n", | |
| "[23012] \n", | |
| "[END]\n", | |
| "[24964] \n", | |
| "[END]\n", | |
| "[24965] \n", | |
| "[END]\n", | |
| "[27670] \n", | |
| "[END]\n", | |
| "[31465] \n", | |
| "[END]\n", | |
| "[31466] \n", | |
| "[END]\n", | |
| "[32799] \n", | |
| "[END]\n", | |
| "[35079] \n", | |
| "[END]\n", | |
| "[35080] \n", | |
| "[END]\n", | |
| "[37662] \n", | |
| "[END]\n", | |
| "[37663] \n", | |
| "[END]\n", | |
| "[39318] \n", | |
| "[END]\n", | |
| "[39321] \n", | |
| "[END]\n", | |
| "[45101] \n", | |
| "[END]\n", | |
| "[48448] \n", | |
| "[END]\n", | |
| "[48450] \n", | |
| "[END]\n", | |
| "[48451] \n", | |
| "[END]\n", | |
| "[48454] \n", | |
| "[END]\n", | |
| "[49221] \n", | |
| "[END]\n", | |
| "[49222] \n", | |
| "[END]\n", | |
| "[51263] \n", | |
| "[END]\n", | |
| "[55062] \n", | |
| "[END]\n", | |
| "[64458] \n", | |
| "[END]\n", | |
| "[66912] \n", | |
| "[END]\n", | |
| "[70704] \n", | |
| "[END]\n", | |
| "[74897] \n", | |
| "[END]\n", | |
| "[75821] \n", | |
| "[END]\n", | |
| "[75826] \n", | |
| "[END]\n", | |
| "[75975] \n", | |
| "[END]\n", | |
| "[79343] \n", | |
| "[END]\n", | |
| "[80313] \n", | |
| "[END]\n", | |
| "[80691] \n", | |
| "[END]\n", | |
| "[82383] \n", | |
| "[END]\n", | |
| "[82385] \n", | |
| "[END]\n", | |
| "[83542] \n", | |
| "[END]\n", | |
| "[85636] \n", | |
| "[END]\n", | |
| "[88839] \n", | |
| "[END]\n", | |
| "[91118] \n", | |
| "[END]\n", | |
| "[91831] \n", | |
| "[END]\n", | |
| "[91832] \n", | |
| "[END]\n", | |
| "[92274] \n", | |
| "[END]\n", | |
| "[98705] \n", | |
| "[END]\n", | |
| "[100113] \n", | |
| "[END]\n", | |
| "[102805] \n", | |
| "[END]\n", | |
| "[103524] \n", | |
| "[END]\n", | |
| "[103525] \n", | |
| "[END]\n", | |
| "[103531] \n", | |
| "[END]\n", | |
| "[103532] \n", | |
| "[END]\n", | |
| "[104059] \n", | |
| "[END]\n", | |
| "[105195] \n", | |
| "[END]\n", | |
| "[105196] \n", | |
| "[END]\n", | |
| "[105204] \n", | |
| "[END]\n", | |
| "[109823] \n", | |
| "[END]\n", | |
| "[112173] \n", | |
| "[END]\n", | |
| "[112174] \n", | |
| "[END]\n", | |
| "[112522] \n", | |
| "[END]\n", | |
| "[114362] \n", | |
| "[END]\n", | |
| "[116957] \n", | |
| "[END]\n", | |
| "[116958] \n", | |
| "[END]\n", | |
| "[119007] \n", | |
| "[END]\n", | |
| "[120015] \n", | |
| "[END]\n", | |
| "[121140] \n", | |
| "[END]\n", | |
| "[121142] \n", | |
| "[END]\n", | |
| "[121147] \n", | |
| "[END]\n", | |
| "[123323] \n", | |
| "[END]\n", | |
| "[123324] \n", | |
| "[END]\n", | |
| "[126575] \n", | |
| "[END]\n", | |
| "[127835] \n", | |
| "[END]\n", | |
| "[127836] \n", | |
| "[END]\n", | |
| "[128796] \n", | |
| "[END]\n", | |
| "[128803] \n", | |
| "[END]\n", | |
| "[133237] \n", | |
| "[END]\n", | |
| "[141777] \n", | |
| "[END]\n", | |
| "[142861] \n", | |
| "[END]\n", | |
| "[142895] \n", | |
| "[END]\n", | |
| "[144866] \n", | |
| "[END]\n", | |
| "[145908] \n", | |
| "[END]\n", | |
| "[146305] \n", | |
| "[END]\n", | |
| "[146306] \n", | |
| "[END]\n", | |
| "[146751] \n", | |
| "[END]\n", | |
| "[147268] \n", | |
| "[END]\n", | |
| "[147269] \n", | |
| "[END]\n", | |
| "[147881] \n", | |
| "[END]\n", | |
| "[151364] \n", | |
| "[END]\n", | |
| "[151905] \n", | |
| "[END]\n", | |
| "[156970] \n", | |
| "[END]\n", | |
| "[162701] \n", | |
| "[END]\n", | |
| "[164167] \n", | |
| "[END]\n", | |
| "[166196] \n", | |
| "[END]\n", | |
| "[166202] \n", | |
| "[END]\n", | |
| "[166791] \n", | |
| "[END]\n", | |
| "[167671] \n", | |
| "[END]\n", | |
| "[169043] \n", | |
| "[END]\n", | |
| "[169044] \n", | |
| "[END]\n", | |
| "[172187] \n", | |
| "[END]\n", | |
| "[172202] \n", | |
| "[END]\n", | |
| "[174471] \n", | |
| "[END]\n", | |
| "[174472] \n", | |
| "[END]\n", | |
| "[177674] \n", | |
| "[END]\n", | |
| "[178739] \n", | |
| "[END]\n", | |
| "[179035] \n", | |
| "[END]\n", | |
| "[179036] \n", | |
| "[END]\n", | |
| "[179379] \n", | |
| "[END]\n", | |
| "[179808] \n", | |
| "[END]\n", | |
| "[180869] \n", | |
| "[END]\n", | |
| "[182652] \n", | |
| "[END]\n", | |
| "[184332] \n", | |
| "[END]\n", | |
| "[184333] \n", | |
| "[END]\n", | |
| "[184483] \n", | |
| "[END]\n", | |
| "[184484] \n", | |
| "[END]\n", | |
| "[190377] \n", | |
| "[END]\n", | |
| "[191032] \n", | |
| "[END]\n", | |
| "[191407] \n", | |
| "[END]\n", | |
| "[192485] \n", | |
| "[END]\n", | |
| "[192486] \n", | |
| "[END]\n", | |
| "[195724] \n", | |
| "[END]\n", | |
| "[197527] \n", | |
| "[END]\n", | |
| "[199217] \n", | |
| "[END]\n", | |
| "[199218] \n", | |
| "[END]\n", | |
| "[199819] \n", | |
| "[END]\n", | |
| "[202672] \n", | |
| "[END]\n", | |
| "[211684] \n", | |
| "[END]\n", | |
| "[214254] \n", | |
| "[END]\n", | |
| "[216153] \n", | |
| "[END]\n", | |
| "[216416] \n", | |
| "[END]\n", | |
| "[216638] \n", | |
| "[END]\n", | |
| "[217317] \n", | |
| "[END]\n", | |
| "[221007] \n", | |
| "[END]\n", | |
| "[225697] \n", | |
| "[END]\n", | |
| " total: 146\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "report_blank_lines(en_filepath)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment