Skip to content

Instantly share code, notes, and snippets.

@naiaden
Last active November 9, 2016 15:57
Show Gist options
  • Select an option

  • Save naiaden/497afff85e205d47079f3ff73d5c4904 to your computer and use it in GitHub Desktop.

Select an option

Save naiaden/497afff85e205d47079f3ff73d5c4904 to your computer and use it in GitHub Desktop.
preprocessing
# Run a single hq4 query for one test set and one backoff strategy.
#
# Positional arguments:
#   $1 - test-set name, used in the input/output file names (default: jrc)
#   $2 - backoff strategy passed to hq4 via -B: uni, mle or ent (default: uni).
#        Any other value falls back to the uniform cache-file option.

# Settings that are baked into the model/output file names below.
# W was previously exported as 2 and then overwritten with 100 before its
# first use; only the effective value is kept.
export W=100
export T=2
export t=2
export SAMPLES=10

# Select the installation by DNS domain. The value is quoted so that a host
# without a DNS domain (empty output) simply takes the else branch instead of
# producing a "unary operator expected" error.
DOMAIN=$(dnsdomainname)
export DOMAIN
if [ "${DOMAIN}" = "science.ru.nl" ]; then
  OUTPUTPREFIX=/scratch/lonrust/cococpypv3
  COCOCPYPPREFIX=/home/lonrust/Software/cococpyp
else
  OUTPUTPREFIX=/esat/spchdisk/scratch/onrust/cococpypv3
  COCOCPYPPREFIX=/users/spraak/onrust/software/cococpyp
fi

export OUTPUTDIR="${OUTPUTPREFIX}/output"
export INPUTDIR="${OUTPUTPREFIX}/input"
export DERIVEDDIR="${OUTPUTPREFIX}/derived"
export MODELDIR="${OUTPUTPREFIX}/models"
export COCOCPYPDIR="${COCOCPYPPREFIX}/hpyplm"

export TEST="${1:-jrc}"
export TRAIN=1bw
export DEBUG=NONE
export BACKOFF="${2:-uni}"
export PREFIX="${BACKOFF}--"

# Name of the hq4 long option that receives the cache file for this backoff.
case "${BACKOFF}" in
  mle) LIMITEDCACHEFILE=limitedmlecachefile ;;
  ent) LIMITEDCACHEFILE=limitedentropycachefile ;;
  *)   LIMITEDCACHEFILE=limiteduniformcachefile ;;
esac

# Common stem shared by the model (-m), output (-M) and pattern-model (-C) names.
model_stem="${TRAIN}_4S_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2"

"${COCOCPYPDIR}/hq4" \
  -o "${MODELDIR}" \
  -m "${model_stem}_train" \
  -M "${PREFIX}${model_stem}-${TEST}" \
  -F "${INPUTDIR}/test-${TEST}.4gram-sm.txt" \
  -O "${OUTPUTDIR}" \
  -C "${DERIVEDDIR}/${model_stem}_train.copamo" \
  --debug "${DEBUG}" \
  -B "${BACKOFF}" \
  -L "${OUTPUTDIR}/${PREFIX}${TRAIN}-4gramsm_4_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2-${TEST}_ngram-common_4.lim" \
  "--${LIMITEDCACHEFILE}" "${DERIVEDDIR}/${BACKOFF}.dummy"
# Train and query a 4-gram model on the first SIZE lines of the 1bw training
# corpus, using the cococpypv2 data tree under the home directory.
W=2
T=2
t=2
SIZE=100000
SIZES=100k

# Tilde expansion happens in these assignments, so the quoted uses below still
# resolve to the home directory.
data_root=~/data/cococpypv2
hpyplm_dir=~/Software/cococpyp/hpyplm
subset_corpus="${data_root}/input/train-1bw.${SIZES}.4gram-sm.txt"
# Stem of the per-length frequency lists. It is built from ${t} so that the
# files written by the loop and the stem handed to hq4 -C cannot drift apart
# (the writer used to hard-code "t2" while the reader used t${t}).
freq_stem="${data_root}/derived/train-1bw-${SIZES}-4gramsm-t${t}-s"
model_stem="1bw-4gramsm-${SIZES}_4_W${W}_t${t}_T${T}_s10_p0_v2"

head -n "${SIZE}" "${data_root}/input/train-1bw.4gram-sm.txt" > "${subset_corpus}"

# One frequency list per pattern length (-m and -l both set to the same value).
for i in 1 2 3 4 5; do
  colibri-freqlist -t "${t}" -m "${i}" -l "${i}" -s "${subset_corpus}" > "${freq_stem}.${i}"
done

"${hpyplm_dir}/hpyplm_train4" \
  -o "${data_root}/models" \
  -m "1bw-4gramsm-${SIZES}" \
  -f "${subset_corpus}" \
  -s 10 -T "${T}" -t "${t}" -W "${W}"

# Rebuild hq4 from scratch. The subshell leaves the caller's working directory
# alone, and rm/make only run if the cd succeeded (previously a failed cd made
# them run in whatever directory the shell happened to be in).
(
  cd -- "${hpyplm_dir}" || exit 1
  rm -f -- hq4
  make hq4
) || printf 'rebuilding hq4 in %s failed\n' "${hpyplm_dir}" >&2
tput reset

# The -L file name is derived from the same variables as -m/-M instead of the
# previously hard-coded "100k_4_W2_t2_T2"; with the values above it is identical.
"${hpyplm_dir}/hq4" \
  -o "${data_root}/models" \
  -m "${model_stem}_train" \
  -M "${model_stem}-1bw" \
  -F "${data_root}/input/test-1bw.4gram-sm.txt" \
  -O "${data_root}/output" \
  -C "${freq_stem}" \
  -L "${data_root}/output/${model_stem}-1bw_ngram-common_4.lim"
# Keep the space-separated fields from the fourth one onwards of each line of
# the n-best list, put "<s> <s> " in front of every resulting line, and store
# the result under the .4gram-sm.txt name.
cut -d ' ' -f 4- fv800001.0.SPKR02-5194-6768.10000-best.txt \
  | sed -e 's/^/<s> <s> /' \
  > fv800001.0.SPKR02-5194-6768.10000-best.4gram-sm.txt
# Step 1: run colibri-patternmodeller on the mediargus class (.cls) and data
# (.dat) files and capture its standard output as the .copamo file.
colibri-patternmodeller \
  -c /scratch/lonrust/cococpypv3/models/mediargus_4S_W250_t2_T2_s10_p0_v2_train.cls \
  -f /scratch/lonrust/cococpypv3/models/mediargus_4S_W250_t2_T2_s10_p0_v2_train.dat \
  -t 2 -m 1 -l 5 -W 250 -s -P \
  > /scratch/lonrust/cococpypv3/output/mediargus_4S_W250_t2_T2_s10_p0_v2_train.copamo

# Step 2: keep the first two tab-separated columns, drop the first line, and
# sort by column 2 (numeric, descending) with column 1 as tie-breaker.
# /scratch/lonrust is used for sort's temporary files.
cut -f1,2 /scratch/lonrust/cococpypv3/output/mediargus_4S_W250_t2_T2_s10_p0_v2_train.copamo \
  | tail -n +2 \
  | sort -T /scratch/lonrust --parallel=16 -k2,2rn -k1,1 -t $'\t' \
  > /scratch/lonrust/cococpypv3/output/mediargus_4S_W250_t2_T2_s10_p0_v2_train.copamo.sorted

# Step 3: write one file per i in 1..5 containing the sorted lines that start
# with exactly i space-separated tokens followed by a tab.
for i in {1..5}; do
  grep -P "^[^ ]+( [^ ]+){$((i-1))}\t" \
    /scratch/lonrust/cococpypv3/output/mediargus_4S_W250_t2_T2_s10_p0_v2_train.copamo.sorted \
    > /scratch/lonrust/cococpypv3/output/mediargus_4S_W250_t2_T2_s10_p0_v2_train.copamo.$i
done
# Shared configuration for the run_subset / run_all experiments.

# Settings that are baked into the model/output file names.
export W=2
export T=2
export t=2
export SAMPLES=10

# Select the installation by DNS domain. The command substitution is quoted:
# on a host without a DNS domain it expands to nothing, and the unquoted form
# turned the test into "[ = science.ru.nl ]", which is a syntax error.
if [ "$(dnsdomainname)" = "science.ru.nl" ]; then
  OUTPUTPREFIX=/scratch/lonrust/cococpypv3
  COCOCPYPPREFIX=/home/lonrust/Software/cococpyp
else
  OUTPUTPREFIX=/esat/spchdisk/scratch/onrust/cococpypv3
  COCOCPYPPREFIX=/users/spraak/onrust/software/cococpyp
fi

export OUTPUTDIR="${OUTPUTPREFIX}/output"
export INPUTDIR="${OUTPUTPREFIX}/input"
export DERIVEDDIR="${OUTPUTPREFIX}/derived"
export MODELDIR="${OUTPUTPREFIX}/models"
export COCOCPYPDIR="${COCOCPYPPREFIX}/hpyplm"

export TEST=jrc
export TRAIN=1bw
export DEBUG=NONE

# "false" makes the run_* functions also prepare data, rebuild the binaries
# and train; any other value goes straight to the hq4 queries.
ONLYTEST=true
#######################################
# Run the experiment on the first 100000 lines of the training corpus.
# When ONLYTEST is "false" the subset corpus and its frequency lists are
# created, hq4/ht4 are rebuilt and the model is trained with ht4 first.
# In every case hq4 is then run once per backoff strategy (uni, mle, ent).
# Globals read:
#   ONLYTEST, INPUTDIR, DERIVEDDIR, MODELDIR, OUTPUTDIR, COCOCPYPDIR,
#   TRAIN, TEST, DEBUG, SAMPLES, W, T, t
# Globals written:
#   SIZE, SIZES (exported); BACKOFF, PREFIX (left at "ent" / "ent--")
# Arguments:
#   None
#######################################
run_subset() {
  export SIZE=100000
  export SIZES=100k

  local subset_corpus="${DERIVEDDIR}/train-${TRAIN}.${SIZES}.4gram-sm.txt"
  local freq_stem="${DERIVEDDIR}/train-${TRAIN}-${SIZES}-4gramsm-t${t}-s"
  local model_stem="${TRAIN}-4gramsm-${SIZES}_4_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2"

  if [[ "${ONLYTEST}" == "false" ]]; then
    head -n "${SIZE}" "${INPUTDIR}/train-${TRAIN}.4gram-sm.txt" > "${subset_corpus}"

    # One frequency list per pattern length 1..5. The command string is
    # expanded by the shell that parallel starts, so it relies on t,
    # DERIVEDDIR, TRAIN and SIZES being present in the environment.
    # NOTE(review): TRAIN is used in the command but not listed with --env;
    # that only matters if parallel runs the jobs remotely - confirm.
    parallel --env SIZES --env t --env DERIVEDDIR \
      'colibri-freqlist -t "${t}" -m {1} -l {1} -s "${DERIVEDDIR}/train-${TRAIN}.${SIZES}.4gram-sm.txt" > "${DERIVEDDIR}/train-${TRAIN}-${SIZES}-4gramsm-t${t}-s.{1}"' \
      ::: 1 2 3 4 5

    # Rebuild both binaries from scratch. The subshell keeps the caller's
    # working directory untouched, and rm/make only run if the cd succeeded
    # (previously a failed cd made them run in the current directory).
    (
      cd -- "${COCOCPYPDIR:?}" || exit 1
      command rm -f -- hq4 ht4
      make hq4 && make ht4
    ) || printf 'run_subset: rebuilding hq4/ht4 in %s failed\n' "${COCOCPYPDIR}" >&2
    tput reset

    "${COCOCPYPDIR}/ht4" \
      -o "${MODELDIR}" \
      -m "${TRAIN}-4gramsm-${SIZES}" \
      -f "${subset_corpus}" \
      -s "${SAMPLES}" -T "${T}" -t "${t}" -W "${W}"
  fi

  local -a query_args
  for BACKOFF in uni mle ent; do
    PREFIX="${BACKOFF}--"
    query_args=(
      -o "${MODELDIR}"
      -m "${model_stem}_train"
      -M "${PREFIX}${model_stem}-${TEST}"
      -F "${INPUTDIR}/test-${TEST}.4gram-sm.txt"
      -O "${OUTPUTDIR}"
      -C "${freq_stem}"
      -L "${OUTPUTDIR}/${model_stem}-${TEST}_ngram-common_4.lim"
    )
    # The trailing options are kept exactly as each original call spelled
    # them: the uni call used "-B", the mle/ent calls used "--backoff".
    # NOTE(review): presumably both spellings select the same hq4 option -
    # confirm against hq4's option parser before unifying them.
    if [[ "${BACKOFF}" == "uni" ]]; then
      "${COCOCPYPDIR}/hq4" "${query_args[@]}" --debug "${DEBUG}" -B "${BACKOFF}"
    else
      "${COCOCPYPDIR}/hq4" "${query_args[@]}" --backoff "${BACKOFF}" --debug "${DEBUG}"
    fi
  done
}
#######################################
# Run the experiment on the full training corpus.
# When ONLYTEST is "false", hq4/ht4 are rebuilt and the model is trained with
# ht4 (using the W that is current at call time). In every case hq4 is then
# run once per backoff strategy (uni, mle, ent) with W forced to 100.
# Globals read:
#   ONLYTEST, INPUTDIR, DERIVEDDIR, MODELDIR, OUTPUTDIR, COCOCPYPDIR,
#   TRAIN, TEST, DEBUG, SAMPLES, W, T, t
# Globals written (all exported):
#   BACKOFF, PREFIX (left at "ent" / "ent--"), W (left at 100)
# Arguments:
#   None
#######################################
run_all() {
  if [[ "${ONLYTEST}" == "false" ]]; then
    #parallel --env SIZES --env t --env DERIVEDDIR 'colibri-freqlist -t ${t} -m {1} -l {1} -s ${INPUTDIR}/train-${TRAIN}.4gram-sm.txt > ${DERIVEDDIR}/train-${TRAIN}-4gramsm-t${t}-s.{1}' ::: 1 2 3 4 5
    # 1bw_4S_W100_t2_T2_s10_p0_v2_train.copamo

    # Rebuild both binaries from scratch. The subshell keeps the caller's
    # working directory untouched, and rm/make only run if the cd succeeded
    # (previously a failed cd made them run in the current directory).
    (
      cd -- "${COCOCPYPDIR:?}" || exit 1
      command rm -f -- hq4 ht4
      make hq4 && make ht4
    ) || printf 'run_all: rebuilding hq4/ht4 in %s failed\n' "${COCOCPYPDIR}" >&2
    tput reset

    # NOTE(review): the training file is read from DERIVEDDIR here, while the
    # commented-out freqlist command above reads the same file name from
    # INPUTDIR - confirm which location is intended.
    "${COCOCPYPDIR}/ht4" \
      -o "${MODELDIR}" \
      -m "${TRAIN}-4gramsm" \
      -f "${DERIVEDDIR}/train-${TRAIN}.4gram-sm.txt" \
      -s "${SAMPLES}" -T "${T}" -t "${t}" -W "${W}"
  fi

  local cache_option model_stem
  for BACKOFF in uni mle ent; do
    export BACKOFF
    export PREFIX="${BACKOFF}--"
    export W=100

    # Each backoff strategy has its own cache-file option name.
    case "${BACKOFF}" in
      uni) cache_option=--limiteduniformcachefile ;;
      mle) cache_option=--limitedmlecachefile ;;
      ent) cache_option=--limitedentropycachefile ;;
    esac

    model_stem="${TRAIN}_4S_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2"
    "${COCOCPYPDIR}/hq4" \
      -o "${MODELDIR}" \
      -m "${model_stem}_train" \
      -M "${PREFIX}${model_stem}-${TEST}" \
      -F "${INPUTDIR}/test-${TEST}.4gram-sm.txt" \
      -O "${OUTPUTDIR}" \
      -C "${DERIVEDDIR}/${model_stem}_train.copamo" \
      --debug "${DEBUG}" \
      -B "${BACKOFF}" \
      -L "${OUTPUTDIR}/${PREFIX}${TRAIN}-4gramsm_4_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2-${TEST}_ngram-common_4.lim" \
      "${cache_option}" "${DERIVEDDIR}/${BACKOFF}.dummy"
  done
}
# Choose the experiment: "true" runs the 100k-line subset, anything else runs
# the full training corpus.
SUBSET=true
if [ "${SUBSET}" = "true" ]; then
  # Was misspelled "rub_subset", which is not a defined command, so the
  # subset experiment never actually ran.
  run_subset
else
  run_all
fi
# Unconditional subset run: build the subset corpus and its frequency lists,
# rebuild hq4/ht4, train with ht4, then query with hq4 once per backoff
# strategy. SIZE, SIZES and the directory/parameter variables used below must
# already be set when this point is reached.
head -n "${SIZE}" "${INPUTDIR}/train-${TRAIN}.4gram-sm.txt" \
  > "${DERIVEDDIR}/train-${TRAIN}.${SIZES}.4gram-sm.txt"

# One frequency list per pattern length 1..5. The command string is expanded
# by the shell that parallel starts, so it relies on t, DERIVEDDIR, TRAIN and
# SIZES being present in the environment.
parallel --env SIZES --env t --env DERIVEDDIR \
  'colibri-freqlist -t "${t}" -m {1} -l {1} -s "${DERIVEDDIR}/train-${TRAIN}.${SIZES}.4gram-sm.txt" > "${DERIVEDDIR}/train-${TRAIN}-${SIZES}-4gramsm-t${t}-s.{1}"' \
  ::: 1 2 3 4 5

# Rebuild both binaries from scratch. The subshell keeps the current working
# directory untouched, and rm/make only run if the cd succeeded (previously a
# failed cd made them run in the current directory).
(
  cd -- "${COCOCPYPDIR:?}" || exit 1
  command rm -f -- hq4 ht4
  make hq4 && make ht4
) || printf 'rebuilding hq4/ht4 in %s failed\n' "${COCOCPYPDIR}" >&2
tput reset

"${COCOCPYPDIR}/ht4" \
  -o "${MODELDIR}" \
  -m "${TRAIN}-4gramsm-${SIZES}" \
  -f "${DERIVEDDIR}/train-${TRAIN}.${SIZES}.4gram-sm.txt" \
  -s "${SAMPLES}" -T "${T}" -t "${t}" -W "${W}"

for BACKOFF in uni mle ent; do
  PREFIX="${BACKOFF}--"
  # The trailing options are kept exactly as each original call spelled them:
  # the uni call used "-B", the mle/ent calls used "--backoff".
  # NOTE(review): presumably both spellings select the same hq4 option -
  # confirm against hq4's option parser before unifying them.
  if [ "${BACKOFF}" = "uni" ]; then
    backoff_flags=(--debug "${DEBUG}" -B "${BACKOFF}")
  else
    backoff_flags=(--backoff "${BACKOFF}" --debug "${DEBUG}")
  fi
  "${COCOCPYPDIR}/hq4" \
    -o "${MODELDIR}" \
    -m "${TRAIN}-4gramsm-${SIZES}_4_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2_train" \
    -M "${PREFIX}${TRAIN}-4gramsm-${SIZES}_4_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2-${TEST}" \
    -F "${INPUTDIR}/test-${TEST}.4gram-sm.txt" \
    -O "${OUTPUTDIR}" \
    -C "${DERIVEDDIR}/train-${TRAIN}-${SIZES}-4gramsm-t${t}-s" \
    -L "${OUTPUTDIR}/${TRAIN}-4gramsm-${SIZES}_4_W${W}_t${t}_T${T}_s${SAMPLES}_p0_v2-${TEST}_ngram-common_4.lim" \
    "${backoff_flags[@]}"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment