Skip to content

Instantly share code, notes, and snippets.

@naiaden
Last active December 22, 2016 16:27
Show Gist options
  • Select an option

  • Save naiaden/a25a50c323a567f51692424d6c0a7594 to your computer and use it in GitHub Desktop.

Select an option

Save naiaden/a25a50c323a567f51692424d6c0a7594 to your computer and use it in GitHub Desktop.
Count backoff depth
import sys
# Lookup tables, one per pattern-model file. Each maps the first
# tab-separated field of a line to the second field (kept as a string).
unigrams = {}
bigrams = {}
trigrams = {}
quadgrams = {}

# (destination table, model file) pairs, loaded in this order.
_model_sources = (
    (unigrams, '/scratch/lonrust/cococpypv3/fixed_derived/1bw_4S_W100_t2_T2_s10_p0_v2_train.copamo.1'),
    (bigrams, '/scratch/lonrust/cococpypv3/fixed_derived/1bw_4S_W100_t2_T2_s10_p0_v2_train.copamo.2'),
    (trigrams, '/scratch/lonrust/cococpypv3/fixed_derived/1bw_4S_W100_t2_T2_s10_p0_v2_train.copamo.3'),
    (quadgrams, '/scratch/lonrust/cococpypv3/fixed_derived/1bw_4S_W100_t2_T2_s10_p0_v2_train.copamo.4'),
)

for _table, _model_path in _model_sources:
    with open(_model_path, 'r') as _model_file:
        for _row in _model_file:
            # Strip surrounding whitespace first, then split on tabs; a line
            # with fewer than two fields raises IndexError.
            _fields = _row.strip().split("\t")
            _table[_fields[0]] = _fields[1]
def classify_backoff(pattern, bigrams, trigrams, quadgrams):
    """Return the ``(uni, bi, tri, quad)`` increments for one pattern.

    ``pattern`` is a space-separated string; the tables are containers of
    pattern strings that are only tested for membership.

    If ``pattern`` itself is in ``quadgrams`` the result is ``(0, 0, 0, 1)``.
    Otherwise three shorter keys are tried, each contributing one ``tri``
    hit when found; a key that is missing falls back to two still shorter
    keys, each contributing one ``bi`` hit when found and one ``uni`` hit
    when not.

    NOTE(review): the published source had lost its indentation. The nesting
    implemented here (every missed three-word key falls back to *both* of its
    two-word keys) is the symmetric reading of the statement order -- confirm
    against the original script.
    """
    if pattern in quadgrams:
        return (0, 0, 0, 1)

    tokens = pattern.split(" ")

    # Three-word keys.
    a_cd = ' '.join(tokens[0:1] + ["{*}"] + tokens[2:])
    # NOTE(review): ab_d and b_d insert two gap markers ("{*} {*}") although
    # their names suggest a single gap, and b_d is looked up in ``bigrams``.
    # Kept exactly as in the original; verify against the model file format.
    ab_d = ' '.join(tokens[0:2] + ["{*} {*}"] + tokens[3:])
    bcd = ' '.join(tokens[1:])

    # Two-word keys used as fallbacks.
    a__d = ' '.join(tokens[0:1] + ["{*} {*}"] + tokens[3:])
    cd = ' '.join(tokens[2:])
    b_d = ' '.join(tokens[1:2] + ["{*} {*}"] + tokens[3:])

    # (key, table, fallbacks) -- fallbacks are (key, table) pairs.
    branches = (
        (a_cd, quadgrams, ((a__d, quadgrams), (cd, bigrams))),
        (ab_d, quadgrams, ((a__d, quadgrams), (b_d, bigrams))),
        (bcd, trigrams, ((cd, bigrams), (b_d, bigrams))),
    )

    uni = bi = tri = 0
    for key, table, fallbacks in branches:
        if key in table:
            tri += 1
            continue
        for fallback_key, fallback_table in fallbacks:
            if fallback_key in fallback_table:
                bi += 1
            else:
                uni += 1
    return (uni, bi, tri, 0)


def count_backoff_depths(lines, bigrams, trigrams, quadgrams,
                         report_every=10000):
    """Accumulate backoff-depth counts over an iterable of pattern lines.

    Each line is stripped and classified with ``classify_backoff``. The
    running totals are printed before every ``report_every``-th line
    (including the first), as a progress indicator. ``report_every`` must
    be a positive integer.

    Returns the final ``(unicounts, bicounts, tricounts, quadcounts)``.
    """
    unicounts = 0
    bicounts = 0
    tricounts = 0
    quadcounts = 0
    for line_nr, line in enumerate(lines):
        # The original test was ``if line_nr % 10000:``, which is true for
        # every line *except* multiples of 10000 and so printed on almost
        # every line; compare against zero to report periodically instead.
        if line_nr % report_every == 0:
            print(unicounts, bicounts, tricounts, quadcounts)
        uni, bi, tri, quad = classify_backoff(
            line.strip(), bigrams, trigrams, quadgrams)
        unicounts += uni
        bicounts += bi
        tricounts += tri
        quadcounts += quad
    return unicounts, bicounts, tricounts, quadcounts


# Run only when executed as a script, so the helpers above can be imported
# without reading sys.argv[1].
if __name__ == "__main__":
    with open(sys.argv[1], 'r') as f:
        unicounts, bicounts, tricounts, quadcounts = count_backoff_depths(
            f, bigrams, trigrams, quadgrams)
    print(unicounts, bicounts, tricounts, quadcounts)
623103048 473545222 134318614 103204440
Get the patterns from the probs file (the pipeline below rewrites each `p(d |a b c) = ...` line as `a b c d`):
fancypants% grep -v '^\*\*\*' ent--1bw_4S_W100_t2_T2_s10_p0_v2-wp_full-ent.lim-ent-common_4_fullnaive_4_entropy.probs | sed 's/^p(//' | awk -F" = " '{print $1}' | sed -rn 's/([^ ]+) \|([^ ]+) ([^ ]+) ([^ ]+)\)$/\2 \3 \4 \1/p' > /scratch/lonrust/aaaaaaaall4grams.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment