Last active
December 22, 2016 16:27
-
-
Save naiaden/a25a50c323a567f51692424d6c0a7594 to your computer and use it in GitHub Desktop.
Count backoff depth
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Count at which back-off depth 4-gram patterns are matched.

Usage: python <this script> PATTERN_FILE

PATTERN_FILE holds one space-separated pattern ("a b c d") per line.  Each
pattern is looked up in pattern tables loaded from tab-separated files; the
running totals are printed as "unicounts bicounts tricounts quadcounts".

NOTE(review): the nesting of the back-off checks was reconstructed from a
copy of this script whose indentation had been lost.  The reconstruction
follows the lattice implied by the statement order:
    a_cd -> (a__d, cd)    ab_d -> (a__d, b_d)    bcd -> (cd, b_d)
Confirm against the author's original file if it is available.
"""
import sys

# Common prefix of the per-order pattern files; the order ("2".."4") is appended.
MODEL_PREFIX = '/scratch/lonrust/cococpypv3/fixed_derived/1bw_4S_W100_t2_T2_s10_p0_v2_train.copamo.'

# Textual marker for one skipped position inside a pattern.
SKIP = "{*}"

# Print the running totals once every this many input lines.
PROGRESS_INTERVAL = 10000


def load_pattern_table(path):
    """Read a tab-separated file into a dict of first column -> second column.

    Blank lines are skipped.  A non-blank line without a tab raises
    IndexError, so a malformed file is not silently accepted.
    """
    table = {}
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.strip().split("\t")
            if fields == ['']:
                continue
            table[fields[0]] = fields[1]
    return table


def count_backoff(pattern, bigrams, trigrams, quadgrams):
    """Return the (uni, bi, tri, quad) count increments for one pattern.

    A pattern present in `quadgrams` contributes a single quad count.
    Otherwise each of the three patterns obtained by dropping one context
    word (a_cd, ab_d, bcd) is checked: a hit adds one tri count, a miss
    falls through to that pattern's two sub-patterns, each of which adds
    one bi count on a hit and one uni count on a miss.
    """
    if pattern in quadgrams:
        return (0, 0, 0, 1)

    tokens = pattern.split(" ")
    first, second, last = tokens[0:1], tokens[1:2], tokens[3:]

    a_cd = ' '.join(first + [SKIP] + tokens[2:])
    # Exactly one position (c) is skipped here.  The previous two-marker form
    # produced a five-position string that could never match a 4-gram.
    ab_d = ' '.join(tokens[0:2] + [SKIP] + last)
    bcd = ' '.join(tokens[1:])
    a__d = ' '.join(first + [SKIP, SKIP] + last)
    cd = ' '.join(tokens[2:])
    # One skipped position as well (c), hence a single marker.
    # NOTE(review): this three-position pattern is looked up in `bigrams`,
    # as before; verify that the order-2 file really contains such skipgrams.
    b_d = ' '.join(second + [SKIP] + last)

    lattice = (
        (a_cd, quadgrams, ((a__d, quadgrams), (cd, bigrams))),
        (ab_d, quadgrams, ((a__d, quadgrams), (b_d, bigrams))),
        (bcd, trigrams, ((cd, bigrams), (b_d, bigrams))),
    )

    uni = bi = tri = 0
    for parent, parent_table, children in lattice:
        if parent in parent_table:
            tri += 1
            continue
        for child, child_table in children:
            if child in child_table:
                bi += 1
            else:
                uni += 1
    return (uni, bi, tri, 0)


def main(argv=None, model_prefix=MODEL_PREFIX):
    """Tally back-off depths for every pattern in the file named by argv[1].

    Prints the running totals every PROGRESS_INTERVAL lines and once more at
    the end, and returns the final (uni, bi, tri, quad) totals.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: python <script> PATTERN_FILE")

    # The order-1 file is not loaded: no lookup ever consults unigrams.
    bigrams = load_pattern_table(model_prefix + "2")
    trigrams = load_pattern_table(model_prefix + "3")
    quadgrams = load_pattern_table(model_prefix + "4")

    totals = [0, 0, 0, 0]
    with open(argv[1], 'r') as handle:
        for line_nr, line in enumerate(handle):
            # Report on every PROGRESS_INTERVAL-th line only; a bare
            # `line_nr % N` test is true on all the *other* lines.
            if line_nr % PROGRESS_INTERVAL == 0:
                print(*totals)
            depths = count_backoff(line.strip(), bigrams, trigrams, quadgrams)
            totals = [total + step for total, step in zip(totals, depths)]
    print(*totals)
    return tuple(totals)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| 623103048 473545222 134318614 103204440 | |
| Get the 4-gram patterns from the probability file (each "p(w |a b c) = ..." line is rewritten as "a b c w"): | |
| fancypants% grep -v '^\*\*\*' ent--1bw_4S_W100_t2_T2_s10_p0_v2-wp_full-ent.lim-ent-common_4_fullnaive_4_entropy.probs | sed 's/^p(//' | awk -F" = " '{print $1}' | sed -rn 's/([^ ]+) \|([^ ]+) ([^ ]+) ([^ ]+)\)$/\2 \3 \4 \1/p' > /scratch/lonrust/aaaaaaaall4grams.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment