From cbc8eeb3a4e75fa4d3679ece969a9a65c99bb8ad Mon Sep 17 00:00:00 2001 From: saikiran valluri Date: Tue, 19 Feb 2019 00:02:18 -0500 Subject: [PATCH 01/49] Spanish Gigaword LM recipe --- .../s5_gigaword/cmd.sh | 15 + .../s5_gigaword/conf/decode.config | 6 + .../s5_gigaword/conf/mfcc.conf | 2 + .../s5_gigaword/conf/mfcc_hires.conf | 10 + .../s5_gigaword/conf/online_cmvn.conf | 1 + .../s5_gigaword/conf/plp.conf | 2 + .../local/callhome_create_splits.sh | 31 + .../s5_gigaword/local/callhome_data_prep.sh | 163 ++++ .../s5_gigaword/local/callhome_get_1_best.py | 75 ++ .../local/callhome_get_lattices.py | 115 +++ .../local/callhome_make_spk2gender.sh | 29 + .../s5_gigaword/local/callhome_make_trans.pl | 74 ++ .../s5_gigaword/local/callhome_text_pp.sh | 9 + .../s5_gigaword/local/chain/run_tdnn_1g.sh | 288 +++++++ .../s5_gigaword/local/clean_txt_dir.sh | 51 ++ .../s5_gigaword/local/create_oracle_ctm.sh | 30 + .../s5_gigaword/local/create_splits.sh | 30 + .../s5_gigaword/local/ctm.sh | 34 + .../s5_gigaword/local/decode_report.py | 148 ++++ .../s5_gigaword/local/find_unique_phones.pl | 25 + .../s5_gigaword/local/fix_stm.sh | 10 + .../flatten_gigaword/flatten_all_gigaword.sh | 15 + .../flatten_gigaword/flatten_one_gigaword.py | 61 ++ .../local/flatten_gigaword/run_flat.sh | 17 + .../s5_gigaword/local/fsp_create_test_lang.sh | 49 ++ .../s5_gigaword/local/fsp_data_prep.sh | 175 ++++ .../local/fsp_ideal_data_partitions.pl | 85 ++ .../s5_gigaword/local/fsp_make_spk2gender.sh | 29 + .../s5_gigaword/local/fsp_make_trans.pl | 81 ++ .../s5_gigaword/local/fsp_prepare_dict.sh | 142 ++++ .../s5_gigaword/local/fsp_train_lms.sh | 140 ++++ .../s5_gigaword/local/get_1_best.py | 62 ++ .../s5_gigaword/local/get_data_weights.pl | 39 + .../s5_gigaword/local/get_lattices.py | 115 +++ .../s5_gigaword/local/get_oracle.sh | 32 + .../s5_gigaword/local/isolate_phones.pl | 66 ++ .../s5_gigaword/local/latconvert.sh | 124 +++ .../s5_gigaword/local/merge_lexicons.py | 65 ++ .../s5_gigaword/local/monitor_denlats.sh | 31 + .../local/nnet3/run_ivector_common.sh | 187 +++++ .../s5_gigaword/local/pocolm_cust.sh | 117 +++ .../s5_gigaword/local/process_oracle.py | 64 ++ .../s5_gigaword/local/rescore.sh | 24 + .../s5_gigaword/local/rnnlm.sh | 84 ++ .../s5_gigaword/local/run_norm.sh | 33 + .../s5_gigaword/local/run_sgmm2x.sh | 57 ++ .../s5_gigaword/local/score.sh | 1 + .../s5_gigaword/local/score_oracle.sh | 29 + .../s5_gigaword/local/splits/dev | 20 + .../local/splits/split_callhome/dev | 20 + .../local/splits/split_callhome/test | 20 + .../local/splits/split_callhome/train | 80 ++ .../s5_gigaword/local/splits/split_fisher/dev | 20 + .../local/splits/split_fisher/dev2 | 20 + .../local/splits/split_fisher/test | 20 + .../local/splits/split_fisher/train | 759 ++++++++++++++++++ .../s5_gigaword/local/splits/test | 20 + .../s5_gigaword/local/splits/train | 80 ++ .../s5_gigaword/local/spron.pl | 304 +++++++ .../s5_gigaword/local/subset_data_prep.sh | 164 ++++ .../s5_gigaword/local/train_get_1_best.py | 79 ++ .../s5_gigaword/local/train_get_lattices.py | 125 +++ .../s5_gigaword/local/train_pocolm.sh | 39 + .../s5_gigaword/local/train_process_oracle.py | 79 ++ .../s5_gigaword/local/wer_output_filter | 5 + .../s5_gigaword/path.sh | 13 + .../s5_gigaword/path_venv.sh | 13 + egs/fisher_callhome_spanish/s5_gigaword/rnnlm | 1 + .../s5_gigaword/run.sh | 299 +++++++ egs/fisher_callhome_spanish/s5_gigaword/steps | 1 + egs/fisher_callhome_spanish/s5_gigaword/utils | 1 + 71 files changed, 5254 insertions(+) create mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/cmd.sh create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_prepare_dict.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py create mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/local/score.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/test create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/train create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/rnnlm create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/run.sh create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/steps create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/utils diff --git a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh new file mode 100755 index 00000000000..0511bd2bbb0 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
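+#
+# If you have no grid and want to run everything locally, a minimal alternative
+# (illustrative only; the settings below are what this recipe actually uses) is:
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl
+#   export mkgraph_cmd=run.pl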
+ +export train_cmd="retry.pl queue.pl" +export decode_cmd="retry.pl queue.pl --mem 8G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config b/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config new file mode 100644 index 00000000000..7908f178373 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config @@ -0,0 +1,6 @@ +# Use wider-than-normal decoding beams. +first_beam=16.0 +beam=20.0 +lat_beam=10.0 +min_lmwt=2 +max_lmwt=10 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf new file mode 100644 index 00000000000..ffb41a1aae4 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=8000 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf new file mode 100644 index 00000000000..d870ab04c38 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf new file mode 100644 index 00000000000..c4b73674cab --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf @@ -0,0 +1,2 @@ +# No non-default options for now. + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh new file mode 100755 index 00000000000..07814da46a9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +data_dir=data +train_all=data/callhome_train_all + +if [ $# -lt 1 ]; then + echo "Specify the location of the split files" + exit 1; +fi + +splitFile=$1 + +# Train first +for split in train dev test +do + dirName=callhome_$split + + cp -r $train_all $data_dir/$dirName + + awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ + $splitFile/$split $train_all/segments > $data_dir/$dirName/segments + + n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$dirName/segments | sort | uniq | wc -l` + + echo "$n conversations left in split $dirName" + + utils/fix_data_dir.sh $data_dir/$dirName + utils/validate_data_dir.sh $data_dir/$dirName +done + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh new file mode 100755 index 00000000000..f61b0fa9519 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh @@ -0,0 +1,163 @@ +#!/bin/bash +# +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# The input is the Callhome Spanish Dataset. (*.sph files) +# In addition the transcripts are needed as well. +# To be run from one directory above this script. + +# Note: when creating your own data preparation scripts, it's a good idea +# to make sure that the speaker id (if present) is a prefix of the utterance +# id, that the output scp file is sorted on utterance id, and that the +# transcription file is exactly the same length as the scp file and is also +# sorted on utterance id (missing transcriptions should be removed from the +# scp file using e.g. scripts/filter_scp.pl) + +stage=0 + +export LC_ALL=C + + +if [ $# -lt 2 ]; then + echo "Arguments should be the location of the Callhome Spanish Speech and Transcript Directories, se +e ../run.sh for example." + exit 1; +fi + +cdir=`pwd` +dir=`pwd`/data/local/data +local=`pwd`/local +utils=`pwd`/utils +tmpdir=`pwd`/data/local/tmp + +. ./path.sh || exit 1; # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi +cd $dir + +# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command +# line arguments being absolute pathnames. +#rm -r links/ 2>/dev/null +mkdir -p links/ +ln -s $* links + +# Basic spot checks to see if we got the data that we needed +if [ ! -d links/LDC96S35 -o ! -d links/LDC96T17 ]; +then + echo "The speech and the data directories need to be named LDC96S35 and LDC96T17 respecti +vely" + exit 1; +fi + +if [ ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN ]; +then + echo "Dev, Eval or Train directories missing or not properly organised within the speech data dir" + exit 1; +fi + +#Check the transcripts directories as well to see if they exist +if [ ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest -o ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest -o ! 
-d links/LDC96T17/callhome_spanish_trans_970711/transcrp/train ] +then + echo "Transcript directories missing or not properly organised" + exit 1; +fi + +speech_train=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN +speech_dev=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST +speech_test=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST +transcripts_train=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/train +transcripts_dev=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest +transcripts_test=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest + +fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` +fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` +fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` +fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` +fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` +fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` + +#Now check if we got all the files that we needed +if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; +then + echo "Incorrect number of files in the data directories" + echo "The paritions should contain 80/20/20 files" + exit 1; +fi + +if [ $stage -le 0 ]; then + #Gather all the speech files together to create a file list + ( + find $speech_train -iname '*.sph'; + find $speech_dev -iname '*.sph'; + find $speech_test -iname '*.sph'; + ) > $tmpdir/callhome_train_sph.flist + + #Get all the transcripts in one place + + ( + find $transcripts_train -iname '*.txt'; + find $transcripts_dev -iname '*.txt'; + find $transcripts_test -iname '*.txt'; + ) > $tmpdir/callhome_train_transcripts.flist + +fi + +if [ $stage -le 1 ]; then + $local/callhome_make_trans.pl $tmpdir + mkdir -p $dir/callhome_train_all + mv $tmpdir/callhome_reco2file_and_channel $dir/callhome_train_all/ +fi + +if [ $stage -le 2 ]; then + sort $tmpdir/callhome.text.1 | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $dir/callhome_train_all/callhome.text + + #Create segments file and utt2spk file + ! cat $dir/callhome_train_all/callhome.text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/callhome_train_all/callhome_utt2spk \ + && echo "Error producing utt2spk file" && exit 1; + + cat $dir/callhome_train_all/callhome.text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; + $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/callhome_train_all/callhome_segments + + $utils/utt2spk_to_spk2utt.pl <$dir/callhome_train_all/callhome_utt2spk > $dir/callhome_train_all/callhome_spk2utt +fi + +if [ $stage -le 3 ]; then + for f in `cat $tmpdir/callhome_train_sph.flist`; do + # convert to absolute path + make_absolute.sh $f + done > $tmpdir/callhome_train_sph_abs.flist + + cat $tmpdir/callhome_train_sph_abs.flist | perl -ane 'm:/([^/]+)\.SPH$: || die "bad line $_; "; print lc($1)," $_"; ' > $tmpdir/callhome_sph.scp + cat $tmpdir/callhome_sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ + sort -k1,1 -u > $dir/callhome_train_all/callhome_wav.scp || exit 1; +fi + +if [ $stage -le 4 ]; then + # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. 
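+  # (In this CALLHOME setup the temporary map comes from callhome_make_trans.pl,
+  # which appears to label every speaker "m" because the transcripts carry no
+  # gender information; callhome_make_spk2gender.sh then post-processes that file.)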
+ cd $cdir + #TODO: needs to be rewritten + $local/callhome_make_spk2gender.sh > $dir/callhome_train_all/callhome_spk2gender +fi + +# Rename files from the callhome directory +if [ $stage -le 5 ]; then + cd $dir/callhome_train_all + mv callhome.text text + mv callhome_segments segments + mv callhome_spk2utt spk2utt + mv callhome_wav.scp wav.scp + mv callhome_reco2file_and_channel reco2file_and_channel + mv callhome_spk2gender spk2gender + mv callhome_utt2spk utt2spk + cd $cdir +fi + +fix_data_dir.sh $dir/callhome_train_all || exit 1 +utils/validate_data_dir.sh --no-feats $dir/callhome_train_all || exit 1 + +echo "CALLHOME spanish Data preparation succeeded." + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py new file mode 100755 index 00000000000..a81818c2858 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +import os +import sys + +def findTranscription(timeDetail): + file1 = open('exp/tri5a/decode_callhome_dev/scoring/13.tra') + file2 = open('exp/tri5a/decode_callhome_train/scoring/13.tra') + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + for line in file2: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + + +wordsFile = open('exp/tri5a/graph/words.txt') +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
+# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists('exp/tri5a/one-best/ch_train'): + os.makedirs('exp/tri5a/one-best/ch_train') + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/asr.train', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') + newFile = open('exp/tri5a/one-best/ch_train/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + + newFile.close() +provFile.close() + + + + + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py new file mode 100755 index 00000000000..4c96e01ce7e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +from __future__ import print_function +import os +import sys +import subprocess + +latticeLocation = 'latjosh-2-callhome/lattices-pushed/' + +tmpdir = 'data/local/data/tmp/ch-d/lattmp' +invalidplfdir = 'data/local/data/tmp/ch-d/invalidplf' +symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' + +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/dev') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/asr.test.plf', 'w+') +invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/invalidPLF', 'w+') +blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/blankPLF', 'w+') +rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/removeLines', 'w+') + +if not os.path.exists(tmpdir): + os.makedirs(tmpdir) +if not os.path.exists(invalidplfdir): + os.makedirs(invalidplfdir) +else: + os.system("rm " + invalidplfdir + "/*") + +def latticeConcatenate(lat1, lat2): + ''' + Concatenates lattices, writes temporary results to tmpdir + ''' + if lat1 == "": + os.system('rm ' + tmpdir + '/tmp.lat') + return lat2 + else: + proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) + proc.wait() + return tmpdir + '/tmp.lat' + + +def findLattice(timeDetail): + ''' + Finds the lattice corresponding to a time segment + ''' + if os.path.isfile(latticeLocation + timeDetail + '.lat'): + return latticeLocation + timeDetail + '.lat' + else: + return -1 + + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
+# Now get timing information to concatenate the ASR outputs + +lineNo = 1 +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') + for line in timingFile: + timeInfo = line.split() + + # For utterances that are concatenated in the translation file, + # the corresponding FSTs have to be translated as well + mergedTranslation = "" + for timeDetail in timeInfo: + tmp = findLattice(timeDetail) + if tmp != -1: + # Concatenate lattices + mergedTranslation = latticeConcatenate(mergedTranslation, tmp) + + print(mergedTranslation) + if mergedTranslation != "": + + # Sanjeev's Recipe : Remove epsilons and topo sort + finalFST = tmpdir + "/final.fst" + os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) + + # Now convert to PLF + proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) + PLFline = proc.stdout.readline() + finalPLFFile = tmpdir + "/final.plf" + finalPLF = open(finalPLFFile, "w+") + finalPLF.write(PLFline) + finalPLF.close() + + # now check if this is a valid PLF, if not write it's ID in a + # file so it can be checked later + proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) + line = proc.stdout.readline() + print("{} {}".format(line, lineNo)) + if line.strip() != "PLF format appears to be correct.": + os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) + invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + else: + provFile.write(PLFline) + else: + blankPLF.write(timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + # Now convert to PLF + lineNo += 1 + +provFile.close() +invalidPLF.close() +blankPLF.close() +rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh new file mode 100755 index 00000000000..d06e5fe911f --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Gets the unique speakers from the file created by fsp_make_trans.pl +# Note that if a speaker appears multiple times, it is categorized as female + +import os +import sys + +tmpFileLocation = 'data/local/tmp/callhome_spk2gendertmp' + +tmpFile = None + +try: + tmpFile = open(tmpFileLocation) +except IOError: + print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' + +speakers = {} + +for line in tmpFile: + comp = line.split(' ') + if comp[0] in speakers: + speakers[comp[0]] = "f" + else: + speakers[comp[0]] = comp[1] + +for speaker, gender in speakers.iteritems(): + print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl new file mode 100755 index 00000000000..ec3dfd88037 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl @@ -0,0 +1,74 @@ +#!/usr/bin/env perl +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +use utf8; +use File::Basename; + +($tmpdir)=@ARGV; +$trans="$tmpdir/callhome_train_transcripts.flist"; +$reco="$tmpdir/callhome_reco2file_and_channel"; +open(T, "<", "$trans") || die "Can't open transcripts file"; +open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; +open(O, ">$tmpdir/callhome.text.1") || die "Can't open text file for writing"; +open(G, ">$tmpdir/callhome_spk2gendertmp") || die "Can't open the speaker to gender map file"; +binmode(O, ":utf8"); +while () { + $file = $_; + m:([^/]+)\.txt: || die "Bad filename $_"; + $call_id = $1; + print R "$call_id-A $call_id A\n"; + print R "$call_id-B $call_id B\n"; + open(I, "<$file") || die "Opening file $_"; + binmode(I, ":iso88591"); + #Now read each line and extract information + while () { + #136.37 138.10 B: Ah, bueno, mamita. + chomp; + + my @stringComponents = split(":", $_, 2); + my @timeInfo = split(" ", $stringComponents[0]); + $stringComponents[1] =~ s/^\s+|\s+$//g ; + my $words = $stringComponents[1]; + #Check number of components in this array + if ((scalar @stringComponents) >= 2) { + $start = sprintf("%06d", $timeInfo[0] * 100); + $end = sprintf("%06d", $timeInfo[1] * 100); + length($end) > 6 && die "Time too long $end in $file"; + $side = "A"; + if (index($timeInfo[2], "B") != -1) { + $side = "B"; + } + $utt_id = "${call_id}-$side-$start-$end"; + $speaker_id = "${call_id}-$side"; + # All speakers are treated as male because speaker gender info + # is missing in this file + $gender = "m"; + print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; + $words =~ s|\[\[[^]]*\]\]||g; #removes comments + $words =~ s|\{laugh\}|\$laughter\$|g; # replaces laughter tmp + $words =~ s|\[laugh\]|\$laughter\$|g; # replaces laughter tmp + $words =~ s|\{[^}]*\}|\[noise\]|g; # replaces noise + $words =~ s|\[[^]]*\]|\[noise\]|g; # replaces noise + $words =~ s|\[/*([^]]*)\]|\[noise\]|g; # replaces end of noise + $words =~ s|\$laughter\$|\[laughter\]|g; # replaces laughter again + $words =~ s|\(\(([^)]*)\)\)|\1|g; # replaces unintelligible speech + $words =~ s|<\?([^>]*)>|\1|g; # for unrecognized language + $words =~ s|background speech|\[noise\]|g; + $words =~ s|background noise|\[noise\]|g; + $words =~ s/\[/larrow/g; + $words =~ s/\]/rarrow/g; + $words =~ s/[[:punct:]]//g; + $words =~ s/larrow/\[/g; + $words =~ s/rarrow/\]/g; + $words =~ s/[¿¡]//g; + $words =~ s/\h+/ /g; # horizontal whitespace characters + $words = lc($words); + print O "$utt_id $words\n" || die "Error writing to text file"; + } + } + close(I); +} +close(T); +close(R); +close(O); +close(G); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh new file mode 100755 index 00000000000..37e1eca1687 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +if [ $# -gt 0 ]; then + sentence=$1 + echo $sentence | sed 's:{^[}]*}:[noise]:' +fi + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh new file mode 100755 index 00000000000..c487f1bd222 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. +# with bypass resnet connections, and re-tuned. 
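+# Example invocation from egs/fisher_callhome_spanish/s5_gigaword (the values
+# shown are just the script defaults, so this is illustrative rather than required):
+#   local/chain/run_tdnn_1g.sh --stage 0 --train-set train --gmm tri5a
+#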
+# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:- +# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ] +# %SER 56.98 [ 3577 / 6278 ] +# Scored 6278 sentences, 0 not present in hyp. + +# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn +# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="test dev" +gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 17 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 18 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 19 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand $srand \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.frames-per-iter 5000000 \ + --trainer.optimization.num-jobs-initial 1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.optimization.momentum 0.0 \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context 0 \ + --egs.chunk-right-context 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 21 ]; then + # The reason we are using data/lang_test here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + #LM was trained only on Fisher Spanish train subset. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph_fsp_train || exit 1; + +fi + +rnnlmdir=exp/rnnlm_lstm_tdnn_1b +if [ $stage -le 22 ]; then + local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; +fi + +if [ $stage -le 23 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l " + exit 1; +fi + +txtdir=$1 +textdir=$(realpath $txtdir) +outdir=$(realpath $2) + +workdir=$outdir/tmp +if [ $stage -le 0 ]; then + rm -rf $outdir + mkdir -p $workdir + mkdir -p $textdir/splits + mkdir -p $outdir/data + split -l 1000000 $textdir/in.txt $textdir/splits/out + numsplits=0 + for x in $textdir/splits/*; do + numsplits=$((numsplits+1)) + ln -s $x $outdir/data/$numsplits + done + echo $numsplits + cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . 
+ $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ + local/run_norm.sh \ + sparrowhawk_configuration.ascii_proto \ + $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $outdir/data \ + JOB \ + $outdir/sparrowhawk/ + cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized + + # check if numbers are there in normalized output + awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ + $outdir/text_normalized > $outdir/unique_words + grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers +fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh new file mode 100755 index 00000000000..d48a96db5c4 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# No sanity checks here, they need to be added + +data=data/callhome_test +dir=exp/tri5a/decode_callhome_test +lang=data/lang +LMWT=13 + +[ -f ./path.sh ] && . ./path.sh + +cmd=run.pl +filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" +name=`basename $data`; +model=$dir/../final.mdl # assume model one level up from decoding dir. +symTable=$lang/words.txt + +if [ ! -f $dir/oracle/oracle.lat.gz ]; then + cat $data/text | utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ + lattice-oracle --write-lattices="ark:|gzip -c > $dir/oracle/oracle.lat.gz" \ + "ark:gunzip -c $dir/lat.*.gz|" ark:- ark:- > /dev/null 2>&1 +fi + +lattice-align-words $lang/phones/word_boundary.int $model \ + "ark:gunzip -c $dir/oracle/oracle.lat.gz|" ark:- | \ + lattice-1best --lm-scale=$LMWT ark:- ark:- | nbest-to-ctm ark:- - | \ + utils/int2sym.pl -f 5 $lang/words.txt | \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + > $dir/oracle/$name.ctm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh new file mode 100755 index 00000000000..8a60dc9d422 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +data_dir=data +train_all=data/train_all + +if [ $# -lt 1 ]; then + echo "Specify the location of the split files" + exit 1; +fi + +splitFile=$1 + +# Train first +for split in train dev test dev2 +do + + cp -r $train_all $data_dir/$split + + awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ + $splitFile/$split $train_all/segments > $data_dir/$split/segments + + n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$split/segments | sort | uniq | wc -l` + + echo "$n conversations left in split $split" + + utils/fix_data_dir.sh $data_dir/$split + utils/validate_data_dir.sh $data_dir/$split +done + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh new file mode 100755 index 00000000000..7d09f574580 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +. ./cmd.sh + +split=test +data_dir=data/test +decode_dir=exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4/ +lang_dir=data/lang + +# Create the STM file +# Always create this file before creating the CTM files so that +# channel numbers are properly created. +if [ ! 
-f $data_dir/stm ]; then + /export/a11/guoguo/babel/103-bengali-limitedLP.official/local/prepare_stm.pl $data_dir +fi + +# Create the CTM file +steps/get_ctm.sh $data_dir $lang_dir $decode_dir + +# Make sure that channel markers match +#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} + +# Get the environment variables +. /export/babel/data/software/env.sh + +# Start scoring +/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ + $decode_dir + +# Print a summary of the result +grep "Percent Total Error" $decode_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py b/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py new file mode 100755 index 00000000000..6f3d3f80c95 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python + +# Author : Gaurav Kumar (Johns Hopkins University) +# Gets a report on what the best word error rate was and which iteration +# led to it. This is needed both for reporting purposes and for setting +# the acoustic scale weight which extracting lattices. +# This script is specific to my partitions and needs to be made more general +# or modified + +from __future__ import print_function +import subprocess +import os + +decode_directories = ['exp/tri5a/decode_dev', + 'exp/tri5a/decode_test', + 'exp/tri5a/decode_dev2', + 'exp/sgmm2x_6a/decode_dev_fmllr', + 'exp/sgmm2x_6a/decode_test_fmllr', + 'exp/sgmm2x_6a/decode_dev2_fmllr', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4' + ] + +def get_best_wer(decode_dir): + best_iteration = 0 + best_wer = 100.0 + for i in range(16): + if os.path.isfile("{}/wer_{}".format(decode_dir, i)): + result = subprocess.check_output("tail -n 3 {}/wer_{}".format(decode_dir, i), shell=True) + wer_string = result.split("\n")[0] + wer_details = wer_string.split(' ') + # Get max WER + wer = float(wer_details[1]) + if wer < best_wer: + best_wer = wer + best_iteration = i + return best_iteration, best_wer + +for decode_dir in decode_directories[:6]: + print(decode_dir) + print(get_best_wer(decode_dir)) + +# Separate processing for bMMI stuff +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[6:10]: + iteration, wer 
= get_best_wer(decode_dir) + if wer < best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[10:14]: + iteration, wer = get_best_wer(decode_dir) + if wer < best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[14:18]: + iteration, wer = get_best_wer(decode_dir) + if wer < best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[18:22]: + iteration, wer = get_best_wer(decode_dir) + if wer <= best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[22:26]: + iteration, wer = get_best_wer(decode_dir) + if wer <= best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[26:]: + iteration, wer = get_best_wer(decode_dir) + if wer <= best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl new file mode 100755 index 00000000000..2da41182d20 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +#Finds unique phones from the basic rules file +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +use utf8; + +($b)=$ARGV[0]; +($tmpdir)=$ARGV[1]; +open(BB, "<", "$b/basic_rules") || die "Can't open basic rules"; +binmode(BB, ":iso88591"); +open(O, ">$tmpdir/phones") || die "Can't open text file for writing"; +binmode(O, ":utf8"); +my %phones = qw(); +while () { + chomp; + my @stringComponents = split(/\t/); + m/->\s(\S+)/; + my $phone = $1; + $phone =~ tr/áéíóú/aeiou/; + $phones{$phone} = 1; +} +foreach my $p (keys %phones) { + print O $p, "\n"; +} +#print keys %phones; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh new file mode 100755 index 00000000000..20220d107bc --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +# Fixes the CALLHOME stm files +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +data_dir=$1 + +cat $data_dir/stm | awk '{$1=substr(tolower($1),0,length($1)-4);print;}' > $data_dir/stm_new +mv $data_dir/stm $data_dir/stm.bak +mv $data_dir/stm_new $data_dir/stm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh new file mode 100755 index 00000000000..242359e7c28 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +# Path to Gigaword corpus with all data files decompressed. 
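+# Example invocation (the paths and job count here are illustrative assumptions):
+#   local/flatten_gigaword/flatten_all_gigaword.sh /path/to/gigaword_es /path/to/flat_out 8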
+export GIGAWORDDIR=$1 +# The directory to write output to +export OUTPUTDIR=$2 +# The number of jobs to run at once +export NUMJOBS=$3 + +echo "Flattening Gigaword with ${NUMJOBS} processes..." +mkdir -p $OUTPUTDIR +find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; +echo "Combining the flattened files into one..." +cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py new file mode 100644 index 00000000000..29f6766dd84 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import logging +import os +import re +import spacy +import gzip + +from argparse import ArgumentParser +from bs4 import BeautifulSoup + +en_nlp = spacy.load("es") + + +def flatten_one_gigaword_file(file_path): + f = gzip.open(file_path) + html = f.read() + # Parse the text with BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + + # Iterate over all
<p>
items and get the text for each. + all_paragraphs = [] + for paragraph in soup("p"): + # Turn inter-paragraph newlines into spaces + paragraph = paragraph.get_text() + paragraph = re.sub(r"\n+", "\n", paragraph) + paragraph = paragraph.replace("\n", " ") + # Tokenize the paragraph into words + tokens = en_nlp.tokenizer(paragraph) + words = [str(token) for token in tokens if not + str(token).isspace()] + if len(words) < 3: + continue + all_paragraphs.append(words) + # Return a list of strings, where each string is a + # space-tokenized paragraph. + return [" ".join(paragraph) for paragraph in all_paragraphs] + + +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + parser = ArgumentParser(description=("Flatten a gigaword data file for " + "use in language modeling.")) + parser.add_argument("--gigaword-path", required=True, + metavar="", type=str, + help=("Path to Gigaword directory, with " + "all .gz files unzipped.")) + parser.add_argument("--output-dir", required=True, metavar="", + type=str, help=("Directory to write final flattened " + "Gigaword file.")) + + A = parser.parse_args() + all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) + output_path = os.path.join(A.output_dir, + os.path.basename(A.gigaword_path) + ".flat") + with open(output_path, "w") as output_file: + for paragraph in all_paragraphs: + output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh new file mode 100755 index 00000000000..6b236be0ab9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -e + +. ./path_venv.sh + +# Path to Gigaword corpus with all data files decompressed. +GIGAWORDPATH=$1 +# The directory to write output to +OUTPUTDIR=$2 +file=$(basename ${GIGAWORDPATH}) +if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then + echo "flattening to ${OUTPUTDIR}/${file}.flat" + python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} +else + echo "skipping ${file}.flat" +fi + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh new file mode 100755 index 00000000000..fb765b57e69 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# + +if [ -f path.sh ]; then . ./path.sh; fi + +mkdir -p data/lang_test + +arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz +[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; + +mkdir -p data/lang_test +cp -r data/lang/* data/lang_test + +gunzip -c "$arpa_lm" | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst + + +echo "Checking how stochastic G is (the first of these numbers should be small):" +fstisstochastic data/lang_test/G.fst + +## Check lexicon. +## just have a look and make sure it seems sane. +echo "First few lines of lexicon FST:" +fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head + +echo Performing further checks + +# Checking that G.fst is determinizable. +fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. 
+ +# Checking that L_disambig.fst is determinizable. +fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. + +# Checking that disambiguated lexicon times G is determinizable +# Note: we do this with fstdeterminizestar not fstdeterminize, as +# fstdeterminize was taking forever (presumbaly relates to a bug +# in this version of OpenFst that makes determinization slow for +# some case). +fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ + fstdeterminizestar >/dev/null || echo Error + +# Checking that LG is stochastic: +fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ + fstisstochastic || echo "[log:] LG is not stochastic" + + +echo "$0 succeeded" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh new file mode 100755 index 00000000000..11d65da3e95 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. +# To be run from one directory above this script. + +# Note: when creating your own data preparation scripts, it's a good idea +# to make sure that the speaker id (if present) is a prefix of the utterance +# id, that the output scp file is sorted on utterance id, and that the +# transcription file is exactly the same length as the scp file and is also +# sorted on utterance id (missing transcriptions should be removed from the +# scp file using e.g. scripts/filter_scp.pl) + +stage=0 + +export LC_ALL=C + + +if [ $# -lt 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" + exit 1; +fi + +cdir=`pwd` +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils +tmpdir=`pwd`/data/local/tmp +mkdir -p $tmpdir + +. ./path.sh || exit 1; # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi +cd $dir + +# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command +# line arguments being absolute pathnames. +rm -r links/ 2>/dev/null +mkdir links/ +ln -s $* links + +# Basic spot checks to see if we got the data that we needed +if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; +then + echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti +vely" + exit 1; +fi + +#if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; +if [ ! -d links/LDC2010S01/data/speech ]; +then + echo "Speech directories missing or not properly organised within the speech data dir" + echo "Typical format is LDC2010S01/data/speech" + exit 1; +fi + +#Check the transcripts directories as well to see if they exist +if [ ! 
-d links/LDC2010T04/fisher_spa_tr/data/transcripts ]; +then + echo "Transcript directories missing or not properly organised" + echo "Typical format is LDC2010T04/fisher_spa_tr/data/transcripts" + exit 1; +fi + +#speech_d1=$dir/links/LDC2010S01/DISC1/data/speech +#speech_d2=$dir/links/LDC2010S01/DISC2/data/speech +speech=$dir/links/LDC2010S01/data/speech +transcripts=$dir/links/LDC2010T04/fisher_spa_tr/data/transcripts + +#fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +#fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_s=`find ${speech} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts +#Now check if we got all the files that we needed +#if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +if [ $fcount_s != 819 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively (Total = 819)" + echo "The transcripts should contain 819 files" + exit 1; +fi + +if [ $stage -le 0 ]; then + #Gather all the speech files together to create a file list + #TODO: Train and test split might be required + ( + #find $speech_d1 -iname '*.sph'; + #find $speech_d2 -iname '*.sph'; + find $speech -iname '*.sph'; + ) > $tmpdir/train_sph.flist + + #Get all the transcripts in one place + find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist +fi + +if [ $stage -le 1 ]; then + $local/fsp_make_trans.pl $tmpdir + mkdir -p $dir/train_all + mv $tmpdir/reco2file_and_channel $dir/train_all/ +fi + +if [ $stage -le 2 ]; then + sort $tmpdir/text.1 | grep -v '((' | \ + awk '{if (NF > 1){ print; }}' | \ + sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ + sed 's:\([^<]*\)<\/lname>:\1:g' | \ + sed 's:::g' | \ + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed 's:<\s*cough[\/]*>:[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's:[^<]*<\/background>:[noise]:g' | \ + sed -r 's:<[/]?background[/]?>:[noise]:g' | \ + #One more time to take care of nested stuff + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ + #now handle the exceptions, find a cleaner way to do this? + sed 's:::g' | \ + sed 's:::g' | \ + sed 's:foreign>::g' | \ + sed 's:>::g' | \ + #How do you handle numbers? + grep -v '()' | \ + #Now go after the non-printable characters and multiple spaces + sed -r 's:¿::g' | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $tmpdir/text.2 + cp $tmpdir/text.2 $dir/train_all/text + + #Create segments file and utt2spk file + ! 
cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ + && echo "Error producing utt2spk file" && exit 1; + + cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; + $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); if ($s != $e) {print "$utt $reco $s $e\n"}; ' >$dir/train_all/segments + + $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt +fi + +if [ $stage -le 3 ]; then + for f in `cat $tmpdir/train_sph.flist`; do + # convert to absolute path + make_absolute.sh $f + done > $tmpdir/train_sph_abs.flist + + cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp + cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ + sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; +fi + +if [ $stage -le 4 ]; then + # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. + cd $cdir + $local/fsp_make_spk2gender.sh > $dir/train_all/spk2gender +fi + +fix_data_dir.sh $dir/train_all || exit 1 +validate_data_dir.sh --no-feats $dir/train_all || exit 1 + +echo "Fisher Spanish Data preparation succeeded." + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl new file mode 100755 index 00000000000..538bca58981 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl @@ -0,0 +1,85 @@ +#!/usr/bin/env perl +# +# Johns Hopkins University (Author : Gaurav Kumar) +# +# This script should be run from one directory above the current one +# +# Rough partitions that are needed are : +# +# ASR Train : 120k utterances +# ASR tune : 20k utterances +# ASR eval : 20k utterances +# MT train : 105k utterances +# MT tune : Same as the ASR eval (20k utterances) +# MT eval : 20k utterances +# +# This script tries to find the closest possible matches so that conversations +# belong in one single partition and hence there is no speaker/conversation +# overlap between data partitions + +use Storable 'dclone'; + +$textfile="data/local/data/train_all/text"; +$tmp="data/local/tmp"; + +open(T, "<", "$textfile") || die "Can't open text file"; + +$ongoingConv = ""; +%tmpSplits = (); +@splitNumbers = (17455, 20000, 100000, 20000, 100000); +$splitId = 0; +%splits = (); + +while () { + @myStringComponents = split(/\s/); + @uttid = split('-', $myStringComponents[0]); + $currentConv = $uttid[0]; + if ($currentConv eq $ongoingConv) { + # Same conversation, add to current hash + #print "Same conversation"; + $tmpSplits{$ongoingConv} += 1; + } + else { + # New conversation intiated, first check if there are enough entries + # in the hash + #print $ongoingConv . " " . get_entries_hash(\%tmpSplits) . "\n"; + if (get_entries_hash(\%tmpSplits) > $splitNumbers[$splitId]) { + print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. 
\n"; + #$splits{$splitId} = keys %tmpSplits; + @newArr = keys %tmpSplits; + $splits{$splitId} = dclone(\@newArr); + %tmpSplits = (); + $splitId += 1; + } + $ongoingConv = $currentConv; + $tmpSplits{$ongoingConv} = 1; + } +} +# Put final tmpsplits in the right partition +@newArr = keys %tmpSplits; +$splits{$splitId} = dclone(\@newArr); +foreach (keys %splits) { + #print $_ , " ", $splits{$_}, "\n"; +} +print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. \n"; + +# Write splits to file +foreach my $key ( keys %splits ) { + open(S, ">$tmp/split-$key") || die "Can't open splitfile to write"; + foreach my $file ( @{$splits{$key}} ) { + print $file, "\n"; + print S "$file\n" || die "Error writing to file"; + } + close(S); +} + +sub get_entries_hash() { + my $inputHashRef = shift; + $total = 0; + foreach (keys %{$inputHashRef}) + { + $total += $inputHashRef->{$_}; + } + return $total; +} + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh new file mode 100755 index 00000000000..15b1c0064cf --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Gets the unique speakers from the file created by fsp_make_trans.pl +# Note that if a speaker appears multiple times, it is categorized as female + +import os +import sys + +tmpFileLocation = 'data/local/tmp/spk2gendertmp' + +tmpFile = None + +try: + tmpFile = open(tmpFileLocation) +except IOError: + print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' + +speakers = {} + +for line in tmpFile: + comp = line.split(' ') + if comp[0] in speakers: + speakers[comp[0]] = "f" + else: + speakers[comp[0]] = comp[1] + +for speaker, gender in speakers.iteritems(): + print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl new file mode 100755 index 00000000000..8c3f74e3917 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl @@ -0,0 +1,81 @@ +#!/usr/bin/env perl +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +use utf8; +use File::Basename; +($tmpdir)=@ARGV; +#$tmpdir='../data/local/tmp'; +$trans="$tmpdir/train_transcripts.flist"; +$reco="$tmpdir/reco2file_and_channel"; +open(T, "<", "$trans") || die "Can't open transcripts file"; +open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; +open(O, ">$tmpdir/text.1") || die "Can't open text file for writing"; +open(G, ">$tmpdir/spk2gendertmp") || die "Can't open the speaker to gender map file"; +binmode(O, ":utf8"); +while () { + $file = $_; + m:([^/]+)\.tdf: || die "Bad filename $_"; + $call_id = $1; + print R "$call_id-A $call_id A\n"; + print R "$call_id-B $call_id B\n"; + open(I, "<$file") || die "Opening file $_"; + binmode(I, ":utf8"); + # Get rid of header sections first + foreach ( 0..2 ) { + $tmpLine = ; + } + #Now read each line and extract information + while () { + #20051017_215732_274_fsp.sph 1 0.0 0.909856781803 Audrey female native Audrey 0 0 -1 + chomp; + my @stringComponents = split(/\t/); + + #Check number of components in this array + if ((scalar @stringComponents) >= 11) { + $start = sprintf("%06d", $stringComponents[2] * 100); + $end = sprintf("%06d", $stringComponents[3] * 100); + length($end) > 6 && die "Time too long $end in $file"; + $side = $stringComponents[1] ? "B" : "A"; + $words = $stringComponents[7]; + $utt_id = "${call_id}-$side-$start-$end"; + $speaker_id = "${call_id}-$side"; + $gender = "m"; + if ($stringComponents[5] == "female") { + $gender = "f"; + } + print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; + $words =~ s:/rarrow/g; + $words =~ s/[[:punct:]]//g; + $words =~ s/larrow//g; + $words =~ s:lendarrow: 0){ print; }}' > $tmpdir/uniquewords + if [ ! -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xovzf es_wordlist.json.tgz || exit 1; + cd $cwd + fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords +fi + +#Then get the list of phones form basic_rules in the lexicon folder +if [ $stage -le 1 ]; then + if [ ! 
-d "$lexicon/callhome_spanish_lexicon_970908" ]; then + echo "Could not find folder callhome_spanish_lexicon_970908 in the lexicon folder" + exit 1; + fi + + # This is a preliminary attempt to get the unique phones from the LDC lexicon + # This will be extended based on our lexicon later + perl $local/find_unique_phones.pl $lexicon/callhome_spanish_lexicon_970908 $tmpdir + +fi + +#Get pronunciation for each word using the spron.pl file in the lexicon folder +if [ $stage -le 2 ]; then + #cd $lexicon/callhome_spanish_lexicon_970908 + # Replace all words for which no pronunciation was generated with an orthographic + # representation + cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ + | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ + > $tmpdir/lexicon_raw +fi + +#Break the pronunciation down according to the format required by Kaldi +if [ $stage -le 3 ]; then + # Creates a KALDI compatible lexicon, and extends the phone list + perl $local/isolate_phones.pl $tmpdir + cat $tmpdir/phones_extended | sort | awk '{if ($1 != "") {print;}}' > $tmpdir/phones_extended.1 + mv $tmpdir/phones $tmpdir/phones.small + mv $tmpdir/phones_extended.1 $tmpdir/phones + sort $tmpdir/phones -o $tmpdir/phones + paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | sed -r 's:(\S+)\s#.*:\1 oov:g' > $tmpdir/lexicon.1 + #paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | grep -v '#' > $tmpdir/lexicon.1 +fi + +if [ $stage -le 4 ]; then + # silence phones, one per line. + for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt + echo sil > $dir/optional_silence.txt + + # An extra question will be added by including the silence phones in one class. 
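+  # With the silence phones listed just above, the awk one-liner below should
+  # leave extra_questions.txt containing the single line:
+  #   sil laughter noise oov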
+ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ + $dir/extra_questions.txt || exit 1; + + # Remove [] chars from phones + cat $tmpdir/phones | awk '{if ($1 != "_" && $1 != "[" && $1 != "]") {print;}}' > $tmpdir/phones.1 + rm $tmpdir/phones + mv $tmpdir/phones.1 $tmpdir/phones + cp $tmpdir/phones $dir/nonsilence_phones.txt + + if [ -f $tmpdir/lexicon.2 ]; then rm $tmpdir/lexicon.2; fi + cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" + + # Add prons for laughter, noise, oov + for w in `grep -v sil $dir/silence_phones.txt`; do + sed -i "/\[$w\]/d" $tmpdir/lexicon.2 + done + + for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" + done | cat - $tmpdir/lexicon.2 > $tmpdir/lexicon.3 || exit 1; + + cat $tmpdir/lexicon.3 \ + <( echo "mm m" + echo " oov" ) > $tmpdir/lexicon.4 + + # From the lexicon remove _ from the phonetic representation + cat $tmpdir/lexicon.4 | sed 's:\s_::g' > $tmpdir/lexicon.5 + + cp "$tmpdir/lexicon.5" $dir/lexicon.txt + + cat $datadir/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $tmpdir/word_counts + + awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $tmpdir/word_counts > $tmpdir/oov_counts.txt + echo "*Highest-count OOVs are:" + head -n 20 $tmpdir/oov_counts.txt +fi + +$utils/validate_dict_dir.pl $dir +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh new file mode 100755 index 00000000000..cebf3b222ab --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# To be run from one level above this directory +# Generate the text for the LM training +tmp_dir=data/local/tmp +train_all=data/local/data/train_all + +if [ $# -lt 1 ]; then + echo "Specify the location of the split files" + exit 1; +fi + +splitFile=$1 +split=train +# Train only +if [ -d $tmp_dir/$split ]; then + rm -r $tmp_dir/$split +fi +cp -r $train_all $tmp_dir/$split + +awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ +$splitFile/$split $train_all/segments > $tmp_dir/$split/segments + +n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $tmp_dir/$split/segments | sort | uniq | wc -l` + +echo "$n conversations left in split $split" + +utils/fix_data_dir.sh $tmp_dir/$split +# There is no feature file yet, use --no-feats switch +utils/validate_data_dir.sh --no-feats $tmp_dir/$split + +# Now use this training text + +text=$tmp_dir/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# fisher_data_prep.sh and fisher_prepare_dict.sh +# It takes as input the files +#data/train_all/text +#data/local/dict/lexicon.txt + +dir=`pwd`/data/local/lm +mkdir -p $dir +export LC_ALL=C # You'll get errors about things being not sorted, if you +# have a different locale. +export PATH=$PATH:`pwd`/../../../tools/kaldi_lm +( # First make sure the kaldi_lm toolkit is installed. + cd ../../../tools || exit 1; + if [ -d kaldi_lm ]; then + echo Not installing the kaldi_lm toolkit since it is already there. 
+ else + echo Downloading and installing the kaldi_lm tools + if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; + fi + tar -xvzf kaldi_lm.tar.gz || exit 1; + cd kaldi_lm + make || exit 1; + echo Done making the kaldi_lm tools + fi +) || exit 1; + +mkdir -p $dir + + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + + +exit 0 + +echo "Baseline" + +# From here is some commands to do a baseline with SRILM (assuming +# you have it installed). +heldout_sent=158126 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train + +cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist + + +ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout + +# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM +# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs +# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258 + + +# Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above. +# Difference in WSJ must have been due to different treatment of . +ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout + +# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM +# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs +# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py new file mode 100755 index 00000000000..9c590635562 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +import os +import sys + +scoringFile = "exp/sgmm2x_6a_mmi_b0.2/decode_test_it4/scoring/10.tra" +wordsFile = open('exp/sgmm2x_6a/graph/words.txt') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') +oneBestTmp = 'exp/sgmm2x_6a_mmi_b0.2/one-best/asr-test' +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.test', 'w+') +timLocation = '/export/a04/gkumar/corpora/fishcall/fisher/tim' + +def findTranscription(timeDetail): + file1 = open(scoringFile) + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists(oneBestTmp): + os.makedirs(oneBestTmp) + +for item in fileList: + timingFile = open(timLocation + '/' + item + '.es') + newFile = open(oneBestTmp + '/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + + newFile.close() +provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl new file mode 100755 index 00000000000..ca5b2a46f8e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl @@ -0,0 +1,39 @@ +#!/usr/bin/env perl + +# Nagendra Kumar Goel + +# This takes two arguments: +# 1) Pocolm training output folder +# 2) rnnlm weights file name (for output) + +use POSIX; +use List::Util qw[min max]; + +if (@ARGV != 2) { + die "Usage: get_data_weights.pl \n"; +} + +$pdir = shift @ARGV; +$out = shift @ARGV; + +open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; +open(N, "<$pdir/names") || die "Could not open $pdir/names" ; +open(O, ">$out") || die "Could not open $out for writing" ; + +my %scores = (); + +while() { + @n = split(/\s/,$_); + $name = $n[1]; + $w =
<P>
; + @w = split(/\s/,$w); + $weight = $w[1]; + $scores{$name} = $weight; +} + +$min = min(values %scores); + +for(keys %scores) { + $weightout = POSIX::ceil($scores{$_} / $min); + print O "$_\t1\t$weightout\n"; +} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py new file mode 100755 index 00000000000..5430c18bb5b --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +from __future__ import print_function +import os +import sys +import subprocess + +latticeLocation = 'latjosh-bmmi/lattices-pushed/' + +tmpdir = 'data/local/data/tmp/bmmi-t/lattmp' +invalidplfdir = 'data/local/data/tmp/bmmi-t/invalidplf' +symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' + +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/asr.test.plf', 'w+') +invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/invalidPLF', 'w+') +blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/blankPLF', 'w+') +rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/removeLines', 'w+') + +if not os.path.exists(tmpdir): + os.makedirs(tmpdir) +if not os.path.exists(invalidplfdir): + os.makedirs(invalidplfdir) +else: + os.system("rm " + invalidplfdir + "/*") + +def latticeConcatenate(lat1, lat2): + ''' + Concatenates lattices, writes temporary results to tmpdir + ''' + if lat1 == "": + os.system('rm ' + tmpdir + '/tmp.lat') + return lat2 + else: + proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) + proc.wait() + return tmpdir + '/tmp.lat' + + +def findLattice(timeDetail): + ''' + Finds the lattice corresponding to a time segment + ''' + if os.path.isfile(latticeLocation + timeDetail + '.lat'): + return latticeLocation + timeDetail + '.lat' + else: + return -1 + + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
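+# Rough flow implemented below, per utterance: look up the per-segment lattice
+# FSTs, fstconcat them together, clean up with fstrmepsilon | fsttopsort,
+# convert the result to a PLF line with fsm2plf.sh, and validate it with Moses'
+# checkplf; utterances whose PLF is empty or invalid are logged so the matching
+# lines can later be dropped from the parallel text.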
+# Now get timing information to concatenate the ASR outputs + +lineNo = 1 +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + for line in timingFile: + timeInfo = line.split() + + # For utterances that are concatenated in the translation file, + # the corresponding FSTs have to be translated as well + mergedTranslation = "" + for timeDetail in timeInfo: + tmp = findLattice(timeDetail) + if tmp != -1: + # Concatenate lattices + mergedTranslation = latticeConcatenate(mergedTranslation, tmp) + + print(mergedTranslation) + if mergedTranslation != "": + + # Sanjeev's Recipe : Remove epsilons and topo sort + finalFST = tmpdir + "/final.fst" + os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) + + # Now convert to PLF + proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) + PLFline = proc.stdout.readline() + finalPLFFile = tmpdir + "/final.plf" + finalPLF = open(finalPLFFile, "w+") + finalPLF.write(PLFline) + finalPLF.close() + + # now check if this is a valid PLF, if not write it's ID in a + # file so it can be checked later + proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) + line = proc.stdout.readline() + print("{} {}".format(line, lineNo)) + if line.strip() != "PLF format appears to be correct.": + os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) + invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + else: + provFile.write(PLFline) + else: + blankPLF.write(timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + # Now convert to PLF + lineNo += 1 + +provFile.close() +invalidPLF.close() +blankPLF.close() +rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh new file mode 100755 index 00000000000..451a7c529fb --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Gets lattice oracles +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +if [ $# -lt 3 ]; then + echo "Specify lattice dir, symbol table and text file for partition" + exit 1; +fi + +latticeDir=$1 +textFile=$3 +symTable=$2 +oracleDir=$latticeDir/oracle + +echo $latticeDir +echo $oracleDir + +. ./path.sh + +if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then + echo "Required files not found" + exit 1; +fi + +mkdir -p $oracleDir + +cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \ + utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ + $KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log + +sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl new file mode 100755 index 00000000000..0366dcdacb0 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl @@ -0,0 +1,66 @@ +#!/usr/bin/env perl +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 +# Once the phonetic representation for words is generated by the LDC lexicon +# This script converts them into a KALDI compatible format +# In addition, it extends the list of phonemes to consider based on +# orthograhic representations of those words which do not have stressed vowels + +use utf8; + +($tmpdir)=$ARGV[0]; +open(L, "<", "$tmpdir/lexicon_raw") || die "Can't open raw lexicon"; +open(P, "<" , "$tmpdir/phones") || die "Can't open phone file"; +open(I, ">$tmpdir/lexicon_one_column") || die "Can't open text file for writing"; +open(E, ">$tmpdir/phones_extended") || die "Can't open ex-phone file for writing"; +binmode(P, ":utf8"); +binmode(L, ":utf8"); +binmode(I, ":utf8"); +binmode(E, ":utf8"); + +#Get all phones +my %phones = qw(); +while (
<P>
) { + chomp; + $phones{$_} = 1; +} + +print @phones; + +while () { + if (substr($_, 0, 1) eq "#") { + print I $_; + next; + } + $len = length; + $current = 0; + $splitWord = ""; + while ($current < $len) { + #First check for two char codes + $currentChar2 = substr($_, $current, 2); + $currentChar1 = substr($_, $current, 1); + if (exists($phones{$currentChar2})) { + $splitWord = $splitWord . " " . $currentChar2; + $current = $current + 2; + } + else { + # Check if this phone exists + if (!exists($phones{$currentChar1})) { + $phones{$currentChar1} = 1 + } + $splitWord = $splitWord . " " . $currentChar1; + $current = $current + 1; + } + } + $splitWord =~ s/^\s*(.*?)\s*$/$1/; + print I $splitWord, "\n"; +} + +# Now write the phones to the extended phone file +foreach my $key (keys %phones) { + print E $key, "\n"; +} + +close(L); +close(P); +close(I); +close(E); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh new file mode 100755 index 00000000000..bbe0af5810c --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# Author : Gaurav Kumar, Johns Hopkins University +# Creates OpenFST lattices from Kaldi lattices +# This script needs to be run from one level above this directory + +. ./path.sh + +if [ $# -lt 3 ]; then + echo "Enter the latdir (where the lattices will be put), the decode dir containing lattices and the acoustic scale" + exit 1 +fi + +prunebeam=2 + +latdir=$1 +decode_dir=$2 +acoustic_scale=$3 +#latdir="latjosh-2-callhome" +#decode_dir=exp/tri5a/decode_$partition +#acoustic_scale=0.077 + +stage=0 + +if [ -d $decode_dir ] +then + # TODO:Add scaling factor for weights, how? + rawLatDir="lattices" + compiledLatDir="lattices-bin" + preplfLatDir="lattices-pushed" + + mkdir -p $latdir + mkdir -p $latdir/$rawLatDir + mkdir -p $latdir/$compiledLatDir + mkdir -p $latdir/$preplfLatDir + + for l in $decode_dir/lat.*.gz + do + ( + # Extract file name and unzip the file first + bname=${l##*/} + bname="$latdir/${bname%.gz}" + gunzip -c $l > "$bname.bin" + + if [ $stage -le 0 ]; then + + # Now copy into ark format + $KALDI_ROOT/src/latbin/lattice-copy ark:$bname.bin ark,t:- > "$bname.raw" + + # Prune lattices + $KALDI_ROOT/src/latbin/lattice-prune --acoustic-scale=$acoustic_scale --beam=$prunebeam ark:"$bname.raw" ark:"$bname.pruned" + + # Convert to an openfst compatible format + $KALDI_ROOT/src/latbin/lattice-to-fst --lm-scale=1.0 --acoustic-scale=$acoustic_scale ark:$bname.pruned ark,t:$bname.ark.fst + + fi + + if [ $stage -le 1 ]; then + fileName="" + fileLine=0 + + while read line; do + if [ $fileLine = 0 ]; then + fileName="$line" + fileLine=1 + continue + fi + if [ -z "$line" ]; then + fileLine=0 + continue + fi + # Replace laugh, unk, oov, noise with eps + echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat" + done < $bname.ark.fst + echo "Done isolating lattices" + fi + ) & + done + wait + rm $latdir/*.bin + rm $latdir/*.pruned + + + if [ $stage -le 2 ]; then + #Compile lattices + for l in $latdir/$rawLatDir/*.lat + do + ( + # Arc type needs to be log + bname=${l##*/} + fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname + ) & + done + wait + echo "Done compiling lattices." 
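+    # The compiled lattices are in the log semiring (--arc_type=log above), as
+    # the weight pushing and PLF conversion in the next stage expect; if in
+    # doubt this can be confirmed with e.g. `fstinfo FILE.lat | grep 'arc type'`,
+    # where FILE.lat stands for any lattice under $latdir/$compiledLatDir.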
+ fi + + if [ $stage -le 3 ]; then + #Sanjeev's Recipe for creating valid PLF compatible FSTs" + # Create a dummy FST with one state and no arcs first + echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst + # Push Lattice weights towards initial state + for l in $latdir/$compiledLatDir/*.lat + do + ( + bname=${l##*/} + fstrmepsilon $latdir/$compiledLatDir/$bname | \ + fstpush --push_weights --remove_total_weight - | \ + # Do not topo sort here, do it before converting into PLF + # Sanjeev's Recipe : Concatenate with dummy FST + fstconcat - $latdir/$preplfLatDir/dummy.fst | \ + fstreverse - | \ + fstrmepsilon - | \ + fstreverse - $latdir/$preplfLatDir/$bname + ) & + done + wait + # Let's take a moment to thank the dummy FST for playing its + # part in this process. However, it has to go now. + rm $latdir/$preplfLatDir/dummy.fst + echo "Done performing fst push (initial state)" + fi +else + echo "Complete training and decoding first" +fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py new file mode 100755 index 00000000000..94546dc44c3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., Avaaya + +# Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon +from __future__ import print_function +import sys +import re +import json +import codecs +import operator + +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" + +filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') +merged_lexicon = [] +# All three lexicons are in different formats +# First add the data from lexicon_fisher (A) into the dictionary +fisher = codecs.open(uw_fisher, encoding='utf-8') +for line in fisher: + merged_lexicon.append(line.strip()) +fisher.close() + +print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) + +# Now add data from the LDC lexicon +ldc = codecs.open(uw_LDC, encoding='iso-8859-1') +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) + +print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) + +# Finally add the gigaword data +gigaword = json.load(open(uw_gigaword)) +gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) + +for item in gigaword: + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break + + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) + +# Now write the uniquewords to a file +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') +ltuples = sorted(merged_lexicon) + +for item in ltuples: + if not item==u'ñ' and not re.search(filtered_letters, item): + lf.write(item + "\n") + +lf.close() + +print("Finshed writing unique words") diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh new file mode 100755 index 00000000000..a95893f698a --- /dev/null +++ 
b/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +currentJob=0 + +dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats + +for f in $dir/.done.*; do + d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` + if [ $d -gt $currentJob ]; then + currentJob=$d + fi +done + +currentJob=$((currentJob+1)) + +echo Currently processing job : $currentJob + +for i in $(seq 210); do + job[$i]=$i +done + +dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats/log/$currentJob/q + +for f in $dir/done.*; do + d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` + unset job[$d] +done + +echo sub-splits left : ${#job[@]} +echo ${job[@]} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..cc9de4d26c5 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. + + +stage=7 +nj=30 +train_set=train # you might set this to e.g. train. +test_sets="test dev" +gmm=tri5a # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 7 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 7." + exit 1 +fi + + +if [ $stage -le 8 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 9 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. 
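+  # (Roughly speaking, this step only rewrites wav.scp so that each recording is
+  # piped through sox with a randomly chosen gain; the audio itself is not
+  # re-copied on disk, so it is cheap.)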
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 10 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l in the history of a n-gram +# un-comment the following line +#limit_unk_history_opt="--limit-unk-history=true" + +for order in 3; do + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + lm_name="${num_word}_${order}" + min_counts='' + # Note: the following might be a more reasonable setting: + # min_counts='fisher=2 swbd1=1' + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ + --min-counts=${min_counts} \ + --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ + ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${arpa_dir} + format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz + + # example of pruning. note: the threshold can be less than or more than one. + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + for threshold in 1.0 2.0 4.0; do + pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm + prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz + + done + + # example of pruning by size. + size=1000000 + pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm + prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' + get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz + +done + +# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) + +# the following does does some self-testing, including +# that the computed derivatives are accurate. 
+# local/self_test.sh + +# perplexities from pocolm-estimated language models with pocolm's interpolation +# method from orders 3, 4, and 5 are: +# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) +# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) +# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) + +# note, the perplexities from pocolm-estimated language models with SRILM's +# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), +# 78.8449 and 75.2202 respectively. + +# note, the perplexities from SRILM-estimated language models with SRILM's +# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), +# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py new file mode 100755 index 00000000000..5c68e1204b2 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# Processes lattice oracles + +import os +import sys + +oracleDir = "exp/tri5a/decode_callhome_train/oracle" +wordsFile = open('exp/sgmm2x_6a/graph/words.txt') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') +oracleTmp = 'exp/tri5a/one-best/oracle-ch-train' +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/oracle.train', 'w+') +timLocation = '/export/a04/gkumar/corpora/fishcall/callhome/tim' + +def findTranscription(timeDetail): + file1 = open(oracleDir + "/oracle.tra") + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
+# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists(oracleTmp): + os.makedirs(oracleTmp) + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +for item in fileList: + timingFile = open(timLocation + '/' + item + '.es') + newFile = open(oracleTmp + '/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + + newFile.close() +provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh new file mode 100755 index 00000000000..1b54b304e50 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +. ./cmd.sh + +for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test_fmllr \ + exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it$iter & +done + + +for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_dev data/lang data/dev exp/sgmm2x_6a/decode_dev_fmllr \ + exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it$iter & +done + + +for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_dev2 data/lang data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr \ + exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it$iter & +done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh new file mode 100755 index 00000000000..aa06fdbb293 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Xiaohui Zhang + +# This script trains LMs on the swbd LM-training data. + +# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. +# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 +# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 + + +dir=Spanish_gigawrd/rnnlm +pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned +wordslist= +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=0 +train_stage=-30 +text=Spanish_gigawrd/text_lm +text_dir=Spanish_gigawrd/text_lm + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $dir/config +set -e + +for f in $text/dev.txt; do + [ ! 
-f $f ] && \ + echo "$0: expected file $f to exist;" && exit 1 +done + +if [ $stage -le 0 ]; then + if [ -f $text_dir/unigram_weights ] ; then + mv $text_dir/unigram_weights $pocolm_dir/ + fi + cp $wordslist $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt + rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ + --unk-word="" \ + --data-weights-file=$dir/config/data_weights.txt \ + $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt +fi + +if [ $stage -le 1 ]; then + cat <$dir/config/xconfig + input dim=$embedding_dim name=input + relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) + fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) + fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) + output-layer name=output include-log-softmax=false dim=$embedding_dim +EOF + rnnlm/validate_config_dir.sh $text_dir $dir/config +fi + +if [ $stage -le 2 ]; then + rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir +fi + +if [ $stage -le 3 ]; then + rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ + --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir +fi + +exit 0 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh new file mode 100755 index 00000000000..4a26f6857b8 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -euo pipefail + +punctuation_symbols=( "," "\"" "\`" "\:" "(" ")" "-" ";" "?" "!" "/" "_" "{" "}" "*" ) + +config=$1 +path_prefix=$2 +data=$3 +job=$4 +dir=$5 + +substitute_arg="" +num_syms=0 + +for i in "${punctuation_symbols[@]}"; do + symbol=${punctuation_symbols[${num_syms}]} + if [ $num_syms -eq 0 ]; then + substitute_arg="sed 's:${i}: :g'" + else + substitute_arg=$substitute_arg" | sed 's:${i}: :g'" + fi + substitute_arg=$substitute_arg" |sed 's:${i}$: :g' | sed 's:^${i}: :g'" + num_syms=$((num_syms+1)) +done +mkdir -p $dir/normalize/$job +echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh +bash $dir/normalize/$job/substitute.sh | \ + sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ + sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text +normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh new file mode 100755 index 00000000000..9148b1f1171 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +# This is as run_sgmm2.sh but excluding the "speaker-dependent weights", +# so not doing the symmetric SGMM. + +. ./cmd.sh + +## SGMM on top of LDA+MLLT+SAT features. +if [ ! -f exp/ubm6a/final.mdl ]; then + steps/train_ubm.sh --silence-weight 0.5 --cmd "$train_cmd" 800 data/train data/lang exp/tri5a_ali exp/ubm6a || exit 1; +fi +# Double the number of SAT states : sanjeev +steps/train_sgmm2.sh --spk-dep-weights false --cmd "$train_cmd" 10000 120000 \ + data/train data/lang exp/tri5a_ali exp/ubm6a/final.ubm exp/sgmm2x_6a || exit 1; + +utils/mkgraph.sh data/lang_test exp/sgmm2x_6a exp/sgmm2x_6a/graph || exit 1; + +steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev || exit 1; + +steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev_fmllr || exit 1; + +steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test || exit 1; + +steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test_fmllr || exit 1; + +steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2 || exit 1; + +steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr || exit 1; + + # Now we'll align the SGMM system to prepare for discriminative training. + steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri5a \ + --use-graphs true --use-gselect true data/train data/lang exp/sgmm2x_6a exp/sgmm2x_6a_ali || exit 1; + steps/make_denlats_sgmm2.sh --nj 30 --sub-split 210 --cmd "$decode_cmd" --transform-dir exp/tri5a \ + data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri5a --boost 0.2 \ + data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats exp/sgmm2x_6a_mmi_b0.2 + + for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test exp/sgmm2x_6a_mmi_b0.2/decode_test_it$iter & + done + +wait +steps/decode_combine.sh data/test data/lang exp/tri1/decode exp/tri2a/decode exp/combine_1_2a/decode || exit 1; +steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_mmi/decode exp/combine_sgmm2x_4a_3b/decode || exit 1; +# combining the sgmm run and the best MMI+fMMI run. 
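+# (Note: the exp/sgmm2x_4a and exp/tri3b* directories referenced in the
+# decode_combine.sh calls here are not produced by this recipe; the lines look
+# like they were carried over from another egs setup.)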
+steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_3b_fmmic5/decode || exit 1; + +steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a_mmi_b0.2/decode_it4 exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_mmi_3b_fmmic5/decode || exit 1; + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh new file mode 100755 index 00000000000..21b793a4d27 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +oracle_dir=exp/tri5a/decode_callhome_test/oracle +split=callhome_test +data_dir=data/callhome_test +lang_dir=data/lang + +# Make sure that your STM and CTM files are in UTF-8 encoding +# Any other encoding will cause this script to fail/misbehave + +if [ ! -e $oracle_dir -o ! -e $data_dir -o ! -e $lang_dir ]; then + echo "Missing pre-requisites" + exit 1 +fi + +for i in {5..20}; do + mkdir -p $oracle_dir/score_$i + cp $oracle_dir/$split.ctm $oracle_dir/score_$i/ +done + +. /export/babel/data/software/env.sh + +# Start scoring +/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ + $oracle_dir + +# Print a summary of the result +grep "Percent Total Error" $oracle_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev new file mode 100644 index 00000000000..77e3b01786f --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev @@ -0,0 +1,20 @@ +sp_0897.sph +sp_0968.sph +sp_0981.sph +sp_1062.sph +sp_1292.sph +sp_1411.sph +sp_1413.sph +sp_1552.sph +sp_1554.sph +sp_1805.sph +sp_1808.sph +sp_1882.sph +sp_1930.sph +sp_1947.sph +sp_2037.sph +sp_2054.sph +sp_2057.sph +sp_2107.sph +sp_2109.sph +sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev new file mode 100644 index 00000000000..77e3b01786f --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev @@ -0,0 +1,20 @@ +sp_0897.sph +sp_0968.sph +sp_0981.sph +sp_1062.sph +sp_1292.sph +sp_1411.sph +sp_1413.sph +sp_1552.sph +sp_1554.sph +sp_1805.sph +sp_1808.sph +sp_1882.sph +sp_1930.sph +sp_1947.sph +sp_2037.sph +sp_2054.sph +sp_2057.sph +sp_2107.sph +sp_2109.sph +sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test new file mode 100644 index 00000000000..0cbc3cc95fd --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test @@ -0,0 +1,20 @@ +sp_0053.sph +sp_0082.sph +sp_0084.sph +sp_0088.sph +sp_0681.sph +sp_0699.sph +sp_0776.sph +sp_0857.sph +sp_1031.sph +sp_1100.sph +sp_1148.sph +sp_1156.sph +sp_1186.sph +sp_1212.sph +sp_1345.sph +sp_1435.sph +sp_1578.sph +sp_1648.sph +sp_1807.sph +sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train 
b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train new file mode 100644 index 00000000000..2c936072534 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train @@ -0,0 +1,80 @@ +sp_0085.sph +sp_0096.sph +sp_0098.sph +sp_0100.sph +sp_0291.sph +sp_0713.sph +sp_0724.sph +sp_0726.sph +sp_0731.sph +sp_0733.sph +sp_0753.sph +sp_0788.sph +sp_0826.sph +sp_0831.sph +sp_0836.sph +sp_0841.sph +sp_0850.sph +sp_0855.sph +sp_0892.sph +sp_0899.sph +sp_0910.sph +sp_0917.sph +sp_0919.sph +sp_0923.sph +sp_0945.sph +sp_0950.sph +sp_0951.sph +sp_0992.sph +sp_0997.sph +sp_1013.sph +sp_1039.sph +sp_1044.sph +sp_1045.sph +sp_1058.sph +sp_1060.sph +sp_1063.sph +sp_1081.sph +sp_1106.sph +sp_1122.sph +sp_1140.sph +sp_1175.sph +sp_1195.sph +sp_1198.sph +sp_1231.sph +sp_1234.sph +sp_1255.sph +sp_1260.sph +sp_1261.sph +sp_1262.sph +sp_1264.sph +sp_1266.sph +sp_1273.sph +sp_1275.sph +sp_1284.sph +sp_1286.sph +sp_1304.sph +sp_1308.sph +sp_1333.sph +sp_1341.sph +sp_1353.sph +sp_1368.sph +sp_1379.sph +sp_1384.sph +sp_1449.sph +sp_1463.sph +sp_1574.sph +sp_1740.sph +sp_1759.sph +sp_1849.sph +sp_1908.sph +sp_1915.sph +sp_1918.sph +sp_1974.sph +sp_1976.sph +sp_1988.sph +sp_2000.sph +sp_2056.sph +sp_2070.sph +sp_2091.sph +sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev new file mode 100644 index 00000000000..d3769f0ffb5 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev @@ -0,0 +1,20 @@ +20051009_182032_217_fsp.sph +20051009_210519_219_fsp.sph +20051010_212418_225_fsp.sph +20051016_180547_265_fsp.sph +20051016_210626_267_fsp.sph +20051017_180712_270_fsp.sph +20051017_220530_275_fsp.sph +20051017_234550_276_fsp.sph +20051018_210220_279_fsp.sph +20051018_210744_280_fsp.sph +20051019_190221_288_fsp.sph +20051019_210146_289_fsp.sph +20051019_230329_292_fsp.sph +20051022_180817_311_fsp.sph +20051023_232057_325_fsp.sph +20051024_180453_327_fsp.sph +20051024_181110_329_fsp.sph +20051025_212334_337_fsp.sph +20051026_180724_341_fsp.sph +20051026_211309_346_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 new file mode 100644 index 00000000000..f1b5c293d67 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 @@ -0,0 +1,20 @@ +20050909_210655_26_fsp.sph +20050910_210708_33_fsp.sph +20050913_210933_49_fsp.sph +20050913_211649_50_fsp.sph +20050915_210434_65_fsp.sph +20050916_180332_68_fsp.sph +20050918_180733_81_fsp.sph +20050918_210841_82_fsp.sph +20050920_212030_93_fsp.sph +20050921_210443_99_fsp.sph +20050923_211304_115_fsp.sph +20050925_180713_120_fsp.sph +20050925_180825_121_fsp.sph +20050926_180516_125_fsp.sph +20050926_180555_126_fsp.sph +20050928_000254_141_fsp.sph +20050930_210540_161_fsp.sph +20051002_180726_170_fsp.sph +20051007_181850_205_fsp.sph +20051007_191217_206_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test new file mode 100644 index 00000000000..6190ced077c --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test @@ -0,0 +1,20 @@ +20051028_180633_356_fsp.sph +20051029_211606_365_fsp.sph +20051030_193924_371_fsp.sph +20051101_212731_386_fsp.sph +20051102_134901_389_fsp.sph +20051102_180402_391_fsp.sph 
+20051102_181501_393_fsp.sph +20051103_211105_404_fsp.sph +20051103_233456_406_fsp.sph +20051107_184634_438_fsp.sph +20051109_180253_445_fsp.sph +20051109_210353_450_fsp.sph +20051111_181045_470_fsp.sph +20051111_182216_472_fsp.sph +20051112_181649_485_fsp.sph +20051113_155059_492_fsp.sph +20051113_210221_496_fsp.sph +20051113_214925_498_fsp.sph +20051114_181749_505_fsp.sph +20051115_212123_516_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train new file mode 100644 index 00000000000..b57683842b2 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train @@ -0,0 +1,759 @@ +20050908_182943_22_fsp.sph +20050908_191808_23_fsp.sph +20050909_210428_25_fsp.sph +20050909_221657_28_fsp.sph +20050910_180310_29_fsp.sph +20050910_180330_30_fsp.sph +20050910_181354_31_fsp.sph +20050910_190223_32_fsp.sph +20050911_180647_34_fsp.sph +20050911_200216_35_fsp.sph +20050911_210429_36_fsp.sph +20050911_210530_37_fsp.sph +20050911_210904_38_fsp.sph +20050912_181441_40_fsp.sph +20050912_181538_41_fsp.sph +20050912_182044_42_fsp.sph +20050912_212913_43_fsp.sph +20050913_180324_44_fsp.sph +20050913_180731_46_fsp.sph +20050913_180947_47_fsp.sph +20050913_210409_48_fsp.sph +20050914_000831_51_fsp.sph +20050914_180332_52_fsp.sph +20050914_180606_53_fsp.sph +20050914_181020_54_fsp.sph +20050914_210243_55_fsp.sph +20050914_210822_56_fsp.sph +20050914_220753_58_fsp.sph +20050915_180728_60_fsp.sph +20050915_180740_61_fsp.sph +20050915_192457_62_fsp.sph +20050915_194045_63_fsp.sph +20050915_210200_64_fsp.sph +20050915_210916_66_fsp.sph +20050915_212325_67_fsp.sph +20050916_180740_69_fsp.sph +20050916_200334_70_fsp.sph +20050916_210235_71_fsp.sph +20050916_210510_72_fsp.sph +20050916_223656_73_fsp.sph +20050917_210406_74_fsp.sph +20050917_210805_75_fsp.sph +20050917_211045_76_fsp.sph +20050917_212041_77_fsp.sph +20050918_180326_80_fsp.sph +20050919_000612_83_fsp.sph +20050919_180511_84_fsp.sph +20050919_180703_85_fsp.sph +20050919_180925_86_fsp.sph +20050919_190254_87_fsp.sph +20050920_180330_88_fsp.sph +20050920_180342_89_fsp.sph +20050920_180607_90_fsp.sph +20050920_181919_91_fsp.sph +20050920_211414_92_fsp.sph +20050920_230520_94_fsp.sph +20050921_180639_95_fsp.sph +20050921_181002_96_fsp.sph +20050921_210340_98_fsp.sph +20050921_211329_101_fsp.sph +20050921_221625_102_fsp.sph +20050922_180618_103_fsp.sph +20050922_180948_104_fsp.sph +20050922_210740_106_fsp.sph +20050922_211003_107_fsp.sph +20050922_230412_108_fsp.sph +20050923_180514_110_fsp.sph +20050923_180530_111_fsp.sph +20050923_210442_114_fsp.sph +20050924_180747_117_fsp.sph +20050924_181124_118_fsp.sph +20050925_210645_122_fsp.sph +20050925_231407_123_fsp.sph +20050926_000425_124_fsp.sph +20050926_180719_127_fsp.sph +20050926_220244_130_fsp.sph +20050926_230706_131_fsp.sph +20050927_180422_132_fsp.sph +20050927_181033_133_fsp.sph +20050927_181232_134_fsp.sph +20050927_210320_135_fsp.sph +20050927_210848_136_fsp.sph +20050927_210947_138_fsp.sph +20050927_211929_139_fsp.sph +20050927_231016_140_fsp.sph +20050928_180631_142_fsp.sph +20050928_210256_144_fsp.sph +20050928_210700_145_fsp.sph +20050928_211113_146_fsp.sph +20050928_220320_147_fsp.sph +20050928_232236_148_fsp.sph +20050929_180318_149_fsp.sph +20050929_180722_150_fsp.sph +20050929_180932_151_fsp.sph +20050929_211337_153_fsp.sph +20050929_220820_154_fsp.sph +20050929_230406_155_fsp.sph +20050930_180329_156_fsp.sph +20050930_180411_157_fsp.sph 
+20050930_180646_158_fsp.sph +20050930_200308_159_fsp.sph +20051001_180328_163_fsp.sph +20051001_181004_164_fsp.sph +20051001_210749_166_fsp.sph +20051001_211346_167_fsp.sph +20051002_180339_169_fsp.sph +20051002_210324_171_fsp.sph +20051002_220651_174_fsp.sph +20051003_180434_175_fsp.sph +20051003_211042_178_fsp.sph +20051003_220633_179_fsp.sph +20051004_180351_180_fsp.sph +20051004_180542_181_fsp.sph +20051004_180730_182_fsp.sph +20051004_200737_183_fsp.sph +20051004_211611_185_fsp.sph +20051005_180420_187_fsp.sph +20051005_180709_188_fsp.sph +20051005_213606_191_fsp.sph +20051005_220917_192_fsp.sph +20051005_230659_193_fsp.sph +20051006_180416_194_fsp.sph +20051006_180653_195_fsp.sph +20051006_180815_196_fsp.sph +20051006_181525_197_fsp.sph +20051006_183153_199_fsp.sph +20051006_210246_200_fsp.sph +20051006_210417_201_fsp.sph +20051006_220329_203_fsp.sph +20051008_000036_208_fsp.sph +20051008_180249_209_fsp.sph +20051008_181720_210_fsp.sph +20051008_183224_211_fsp.sph +20051008_190256_212_fsp.sph +20051008_211712_214_fsp.sph +20051008_213416_215_fsp.sph +20051009_180444_216_fsp.sph +20051009_190753_218_fsp.sph +20051009_220443_221_fsp.sph +20051010_180650_222_fsp.sph +20051010_182706_223_fsp.sph +20051010_210622_224_fsp.sph +20051010_222853_227_fsp.sph +20051010_231630_228_fsp.sph +20051011_181919_230_fsp.sph +20051011_211026_232_fsp.sph +20051011_220348_233_fsp.sph +20051012_180233_234_fsp.sph +20051012_190241_236_fsp.sph +20051012_193952_237_fsp.sph +20051012_224157_239_fsp.sph +20051013_180458_240_fsp.sph +20051013_180613_241_fsp.sph +20051013_180700_242_fsp.sph +20051013_182213_244_fsp.sph +20051013_210221_245_fsp.sph +20051013_210425_246_fsp.sph +20051013_210941_247_fsp.sph +20051013_220243_248_fsp.sph +20051014_180259_249_fsp.sph +20051014_180940_250_fsp.sph +20051014_180948_251_fsp.sph +20051014_183707_252_fsp.sph +20051014_210348_253_fsp.sph +20051014_210647_254_fsp.sph +20051014_220227_256_fsp.sph +20051014_230339_257_fsp.sph +20051015_180549_258_fsp.sph +20051015_190247_259_fsp.sph +20051015_210138_260_fsp.sph +20051015_210701_261_fsp.sph +20051015_210831_262_fsp.sph +20051016_180926_266_fsp.sph +20051017_000346_269_fsp.sph +20051017_210137_273_fsp.sph +20051017_215732_274_fsp.sph +20051018_180559_277_fsp.sph +20051018_180816_278_fsp.sph +20051018_211701_282_fsp.sph +20051018_231046_283_fsp.sph +20051018_235317_284_fsp.sph +20051019_180448_285_fsp.sph +20051019_183344_287_fsp.sph +20051020_180339_293_fsp.sph +20051020_180759_295_fsp.sph +20051020_210218_297_fsp.sph +20051020_212525_299_fsp.sph +20051020_222944_300_fsp.sph +20051020_234953_301_fsp.sph +20051021_180218_302_fsp.sph +20051021_180508_303_fsp.sph +20051021_190605_304_fsp.sph +20051021_210159_305_fsp.sph +20051021_210530_306_fsp.sph +20051021_222225_307_fsp.sph +20051022_001311_309_fsp.sph +20051022_180452_310_fsp.sph +20051022_180829_312_fsp.sph +20051022_190406_313_fsp.sph +20051022_200517_314_fsp.sph +20051022_210920_315_fsp.sph +20051022_230324_316_fsp.sph +20051022_232428_317_fsp.sph +20051023_180342_318_fsp.sph +20051023_180530_319_fsp.sph +20051023_190301_321_fsp.sph +20051023_210258_322_fsp.sph +20051023_210605_323_fsp.sph +20051023_223751_324_fsp.sph +20051024_000348_326_fsp.sph +20051024_180624_328_fsp.sph +20051024_210748_330_fsp.sph +20051024_211346_331_fsp.sph +20051024_221753_332_fsp.sph +20051024_230857_333_fsp.sph +20051025_180351_334_fsp.sph +20051025_210532_335_fsp.sph +20051025_210959_336_fsp.sph +20051025_220419_338_fsp.sph +20051026_180611_340_fsp.sph +20051026_190359_343_fsp.sph 
+20051026_210334_344_fsp.sph +20051026_211202_345_fsp.sph +20051026_230956_347_fsp.sph +20051026_234001_348_fsp.sph +20051027_180217_349_fsp.sph +20051027_210159_351_fsp.sph +20051027_210333_352_fsp.sph +20051027_211525_353_fsp.sph +20051027_231329_354_fsp.sph +20051028_180329_355_fsp.sph +20051028_210350_358_fsp.sph +20051028_211904_359_fsp.sph +20051029_200218_363_fsp.sph +20051029_210442_364_fsp.sph +20051029_220538_366_fsp.sph +20051030_000333_367_fsp.sph +20051030_180521_368_fsp.sph +20051030_181001_369_fsp.sph +20051030_190231_370_fsp.sph +20051030_210903_372_fsp.sph +20051030_230444_373_fsp.sph +20051031_180213_374_fsp.sph +20051031_180906_375_fsp.sph +20051031_210229_377_fsp.sph +20051031_220447_379_fsp.sph +20051101_153940_380_fsp.sph +20051101_211314_384_fsp.sph +20051101_223911_387_fsp.sph +20051101_230216_388_fsp.sph +20051102_175957_390_fsp.sph +20051102_210243_394_fsp.sph +20051102_210828_395_fsp.sph +20051102_211130_396_fsp.sph +20051103_163507_398_fsp.sph +20051103_180920_400_fsp.sph +20051103_185102_401_fsp.sph +20051103_210539_403_fsp.sph +20051103_223906_405_fsp.sph +20051104_123901_407_fsp.sph +20051104_180145_408_fsp.sph +20051104_181437_409_fsp.sph +20051104_190247_410_fsp.sph +20051104_210307_411_fsp.sph +20051104_210814_412_fsp.sph +20051104_212121_413_fsp.sph +20051104_222117_414_fsp.sph +20051104_231424_416_fsp.sph +20051105_175657_418_fsp.sph +20051105_181203_419_fsp.sph +20051105_210724_421_fsp.sph +20051105_220745_422_fsp.sph +20051106_180232_424_fsp.sph +20051106_181321_425_fsp.sph +20051106_190219_426_fsp.sph +20051106_200213_427_fsp.sph +20051106_210215_428_fsp.sph +20051106_210310_429_fsp.sph +20051106_211252_430_fsp.sph +20051106_211804_431_fsp.sph +20051106_215339_432_fsp.sph +20051106_221653_433_fsp.sph +20051107_115855_434_fsp.sph +20051107_160351_435_fsp.sph +20051107_180332_436_fsp.sph +20051107_182401_437_fsp.sph +20051107_210309_439_fsp.sph +20051107_212723_440_fsp.sph +20051108_145902_441_fsp.sph +20051108_181424_442_fsp.sph +20051108_210224_443_fsp.sph +20051108_212018_444_fsp.sph +20051109_180413_446_fsp.sph +20051109_181432_447_fsp.sph +20051109_181906_448_fsp.sph +20051109_183631_449_fsp.sph +20051109_210436_451_fsp.sph +20051109_211151_452_fsp.sph +20051109_212148_453_fsp.sph +20051109_232505_454_fsp.sph +20051110_155523_455_fsp.sph +20051110_180208_456_fsp.sph +20051110_180838_457_fsp.sph +20051110_182221_459_fsp.sph +20051110_182318_460_fsp.sph +20051110_210200_461_fsp.sph +20051110_210233_462_fsp.sph +20051110_210454_463_fsp.sph +20051110_211110_464_fsp.sph +20051110_212818_466_fsp.sph +20051110_225245_467_fsp.sph +20051111_181441_471_fsp.sph +20051111_184451_474_fsp.sph +20051111_190326_475_fsp.sph +20051111_194004_477_fsp.sph +20051111_201357_478_fsp.sph +20051111_230329_480_fsp.sph +20051112_000305_482_fsp.sph +20051112_165916_483_fsp.sph +20051112_185651_487_fsp.sph +20051112_190443_488_fsp.sph +20051112_210205_489_fsp.sph +20051112_210631_490_fsp.sph +20051112_231502_491_fsp.sph +20051113_180809_493_fsp.sph +20051113_210908_497_fsp.sph +20051113_220433_499_fsp.sph +20051114_171942_502_fsp.sph +20051114_181118_504_fsp.sph +20051114_210412_506_fsp.sph +20051114_212032_507_fsp.sph +20051114_215057_508_fsp.sph +20051114_220412_509_fsp.sph +20051114_225557_510_fsp.sph +20051115_134012_511_fsp.sph +20051115_180301_512_fsp.sph +20051115_181412_513_fsp.sph +20051115_181731_514_fsp.sph +20051115_182149_515_fsp.sph +20051115_213551_517_fsp.sph +20051115_215935_518_fsp.sph +20051115_230749_520_fsp.sph +20051116_000221_521_fsp.sph 
+20051116_172353_522_fsp.sph +20051116_180237_524_fsp.sph +20051116_181228_525_fsp.sph +20051116_181816_526_fsp.sph +20051116_190450_527_fsp.sph +20051116_210146_528_fsp.sph +20051116_210553_529_fsp.sph +20051116_211222_530_fsp.sph +20051116_212312_531_fsp.sph +20051116_222454_532_fsp.sph +20051116_233038_533_fsp.sph +20051117_001013_534_fsp.sph +20051117_180234_535_fsp.sph +20051117_181844_537_fsp.sph +20051117_210156_538_fsp.sph +20051117_210403_539_fsp.sph +20051117_211540_540_fsp.sph +20051117_211833_541_fsp.sph +20051117_212855_542_fsp.sph +20051117_213407_543_fsp.sph +20051117_220412_544_fsp.sph +20051117_225943_545_fsp.sph +20051118_180619_547_fsp.sph +20051118_180739_548_fsp.sph +20051118_182114_549_fsp.sph +20051118_182652_550_fsp.sph +20051118_210212_551_fsp.sph +20051118_210455_552_fsp.sph +20051118_212058_553_fsp.sph +20051118_212829_554_fsp.sph +20051119_000355_555_fsp.sph +20051119_181105_556_fsp.sph +20051119_210802_557_fsp.sph +20051119_212315_559_fsp.sph +20051119_214926_560_fsp.sph +20051120_181008_561_fsp.sph +20051120_181339_562_fsp.sph +20051120_190412_563_fsp.sph +20051120_205645_565_fsp.sph +20051120_210347_566_fsp.sph +20051120_211526_567_fsp.sph +20051121_181138_569_fsp.sph +20051121_181357_570_fsp.sph +20051121_190155_571_fsp.sph +20051121_210922_573_fsp.sph +20051122_181114_574_fsp.sph +20051122_190326_576_fsp.sph +20051122_210253_577_fsp.sph +20051122_210703_578_fsp.sph +20051122_211805_579_fsp.sph +20051122_213037_580_fsp.sph +20051122_215430_581_fsp.sph +20051123_180926_582_fsp.sph +20051123_181644_583_fsp.sph +20051123_210214_584_fsp.sph +20051123_211514_585_fsp.sph +20051123_212412_586_fsp.sph +20051123_213259_587_fsp.sph +20051124_181720_588_fsp.sph +20051124_190336_589_fsp.sph +20051124_212221_591_fsp.sph +20051124_220457_592_fsp.sph +20051125_181632_593_fsp.sph +20051125_190327_594_fsp.sph +20051125_212150_595_fsp.sph +20051126_181804_597_fsp.sph +20051126_190347_598_fsp.sph +20051126_210222_599_fsp.sph +20051127_181335_601_fsp.sph +20051127_190405_602_fsp.sph +20051127_210516_603_fsp.sph +20051127_211200_604_fsp.sph +20051127_212516_605_fsp.sph +20051128_215149_608_fsp.sph +20051128_222007_609_fsp.sph +20051129_180204_610_fsp.sph +20051129_181241_612_fsp.sph +20051129_181547_613_fsp.sph +20051129_183449_614_fsp.sph +20051129_190152_615_fsp.sph +20051129_210218_616_fsp.sph +20051129_210342_617_fsp.sph +20051129_212711_618_fsp.sph +20051130_181543_619_fsp.sph +20051130_182626_620_fsp.sph +20051130_210202_622_fsp.sph +20051130_210910_623_fsp.sph +20051130_212724_626_fsp.sph +20051130_220121_627_fsp.sph +20051130_221538_628_fsp.sph +20051201_181034_630_fsp.sph +20051201_181303_631_fsp.sph +20051201_183429_632_fsp.sph +20051201_191426_633_fsp.sph +20051201_193415_634_fsp.sph +20051201_195005_635_fsp.sph +20051201_210713_636_fsp.sph +20051201_212329_637_fsp.sph +20051201_230640_638_fsp.sph +20051202_181119_639_fsp.sph +20051202_181659_640_fsp.sph +20051202_182058_641_fsp.sph +20051202_184713_642_fsp.sph +20051202_190154_643_fsp.sph +20051202_193515_644_fsp.sph +20051202_210252_645_fsp.sph +20051202_211824_646_fsp.sph +20051202_212105_647_fsp.sph +20051203_180701_649_fsp.sph +20051203_182100_650_fsp.sph +20051203_182132_651_fsp.sph +20051203_182418_652_fsp.sph +20051203_183501_653_fsp.sph +20051203_190503_654_fsp.sph +20051203_191125_655_fsp.sph +20051203_210216_656_fsp.sph +20051203_212114_658_fsp.sph +20051203_222533_661_fsp.sph +20051206_180753_662_fsp.sph +20051206_180911_663_fsp.sph +20051206_181649_664_fsp.sph +20051206_183057_665_fsp.sph 
+20051206_193937_667_fsp.sph +20051206_201757_668_fsp.sph +20051206_203158_669_fsp.sph +20051206_210127_670_fsp.sph +20051206_210744_671_fsp.sph +20051206_211522_672_fsp.sph +20051206_213252_673_fsp.sph +20051206_214122_674_fsp.sph +20051206_231328_675_fsp.sph +20051207_180507_676_fsp.sph +20051207_181020_677_fsp.sph +20051207_190155_678_fsp.sph +20051207_190426_679_fsp.sph +20051207_193103_681_fsp.sph +20051207_211858_683_fsp.sph +20051207_212300_684_fsp.sph +20051207_212831_685_fsp.sph +20051207_214411_686_fsp.sph +20051208_180208_687_fsp.sph +20051208_180810_688_fsp.sph +20051208_182430_689_fsp.sph +20051208_190333_690_fsp.sph +20051208_210609_691_fsp.sph +20051208_211702_692_fsp.sph +20051208_212444_694_fsp.sph +20051208_214100_696_fsp.sph +20051208_220606_697_fsp.sph +20051209_180824_699_fsp.sph +20051209_181542_700_fsp.sph +20051209_181642_701_fsp.sph +20051209_182541_702_fsp.sph +20051209_182858_703_fsp.sph +20051209_210136_704_fsp.sph +20051209_210452_705_fsp.sph +20051209_211542_706_fsp.sph +20051209_212515_707_fsp.sph +20051209_222427_709_fsp.sph +20051209_231702_710_fsp.sph +20051210_180659_711_fsp.sph +20051210_181201_712_fsp.sph +20051210_182013_713_fsp.sph +20051210_182603_714_fsp.sph +20051210_190201_715_fsp.sph +20051210_210535_717_fsp.sph +20051210_210735_718_fsp.sph +20051211_000414_719_fsp.sph +20051211_181346_720_fsp.sph +20051211_182045_721_fsp.sph +20051211_184252_723_fsp.sph +20051211_190523_724_fsp.sph +20051211_210240_725_fsp.sph +20051211_211415_726_fsp.sph +20051212_180251_727_fsp.sph +20051212_181817_728_fsp.sph +20051212_182453_729_fsp.sph +20051212_190335_730_fsp.sph +20051212_210527_731_fsp.sph +20051212_210738_732_fsp.sph +20051212_211419_733_fsp.sph +20051212_213447_734_fsp.sph +20051212_214512_735_fsp.sph +20051213_180254_736_fsp.sph +20051213_185913_737_fsp.sph +20051213_191741_738_fsp.sph +20051213_210120_739_fsp.sph +20051213_211552_741_fsp.sph +20051213_211953_742_fsp.sph +20051213_221424_743_fsp.sph +20051213_222016_744_fsp.sph +20051214_193942_746_fsp.sph +20051214_194606_747_fsp.sph +20051214_201000_748_fsp.sph +20051214_202717_749_fsp.sph +20051214_211653_750_fsp.sph +20051214_212318_751_fsp.sph +20051214_212718_752_fsp.sph +20051214_213225_753_fsp.sph +20051215_180855_754_fsp.sph +20051215_181731_755_fsp.sph +20051215_182213_756_fsp.sph +20051215_190143_757_fsp.sph +20051215_190419_758_fsp.sph +20051215_195526_759_fsp.sph +20051215_200925_760_fsp.sph +20051215_201639_761_fsp.sph +20051215_203848_762_fsp.sph +20051215_210410_764_fsp.sph +20051215_212456_766_fsp.sph +20051215_212701_767_fsp.sph +20051215_212749_768_fsp.sph +20051215_214814_769_fsp.sph +20051215_220537_770_fsp.sph +20051215_222306_771_fsp.sph +20051216_181042_773_fsp.sph +20051216_182340_774_fsp.sph +20051216_191101_775_fsp.sph +20051216_192823_776_fsp.sph +20051216_200153_777_fsp.sph +20051216_211423_778_fsp.sph +20051216_220626_779_fsp.sph +20051217_142547_780_fsp.sph +20051217_180231_781_fsp.sph +20051217_182026_783_fsp.sph +20051217_182330_784_fsp.sph +20051217_182530_785_fsp.sph +20051217_183115_786_fsp.sph +20051217_190226_787_fsp.sph +20051218_142845_790_fsp.sph +20051218_180353_791_fsp.sph +20051218_181751_792_fsp.sph +20051218_182127_793_fsp.sph +20051218_182750_794_fsp.sph +20051218_200401_799_fsp.sph +20051218_210249_800_fsp.sph +20051218_211820_801_fsp.sph +20051218_212444_802_fsp.sph +20051218_212813_803_fsp.sph +20051219_180225_804_fsp.sph +20051219_182110_806_fsp.sph +20051219_190625_808_fsp.sph +20051219_210655_812_fsp.sph +20051219_212218_813_fsp.sph 
+20051219_212716_814_fsp.sph +20051219_213203_815_fsp.sph +20051219_221213_816_fsp.sph +20051219_223123_817_fsp.sph +20051220_181731_820_fsp.sph +20051220_190121_821_fsp.sph +20051220_212400_826_fsp.sph +20051220_212718_828_fsp.sph +20051220_213420_829_fsp.sph +20051221_000417_830_fsp.sph +20051221_180958_831_fsp.sph +20051221_210452_840_fsp.sph +20051221_212325_841_fsp.sph +20051221_212911_842_fsp.sph +20051222_000436_843_fsp.sph +20051222_181242_845_fsp.sph +20051222_181506_846_fsp.sph +20051222_182617_847_fsp.sph +20051222_184209_849_fsp.sph +20051222_200553_850_fsp.sph +20051222_210309_852_fsp.sph +20051222_212425_855_fsp.sph +20051223_180346_856_fsp.sph +20051223_181050_857_fsp.sph +20051223_183105_860_fsp.sph +20051223_212547_863_fsp.sph +20051223_212853_864_fsp.sph +20051224_180302_865_fsp.sph +20051224_182949_867_fsp.sph +20051224_210150_870_fsp.sph +20051224_213010_871_fsp.sph +20051225_192042_872_fsp.sph +20051225_210556_873_fsp.sph +20051226_180908_874_fsp.sph +20051226_181659_875_fsp.sph +20051227_181058_885_fsp.sph +20051227_211308_887_fsp.sph +20051227_213029_888_fsp.sph +20051227_214843_889_fsp.sph +20051227_220309_890_fsp.sph +20051228_180249_891_fsp.sph +20051228_182051_892_fsp.sph +20051228_183955_893_fsp.sph +20051228_210524_896_fsp.sph +20051228_211808_897_fsp.sph +20051228_212304_899_fsp.sph +20051228_212734_900_fsp.sph +20051228_223227_901_fsp.sph +20051229_180231_902_fsp.sph +20051229_182614_906_fsp.sph +20051229_182631_907_fsp.sph +20051229_214024_909_fsp.sph +20051230_180457_910_fsp.sph +20051230_181721_912_fsp.sph +20051230_210412_913_fsp.sph +20051230_210559_914_fsp.sph +20051230_212557_915_fsp.sph +20051231_000808_916_fsp.sph +20060103_180314_917_fsp.sph +20060103_182107_918_fsp.sph +20060103_182257_919_fsp.sph +20060103_182549_920_fsp.sph +20060103_182654_921_fsp.sph +20060103_184037_922_fsp.sph +20060103_211504_925_fsp.sph +20060103_211732_926_fsp.sph +20060104_180509_928_fsp.sph +20060104_181040_929_fsp.sph +20060104_182115_930_fsp.sph +20060104_182644_931_fsp.sph +20060104_190448_933_fsp.sph +20060104_192707_934_fsp.sph +20060104_210223_935_fsp.sph +20060104_212844_936_fsp.sph +20060104_220148_937_fsp.sph +20060105_202127_943_fsp.sph +20060105_205957_944_fsp.sph +20060105_210951_945_fsp.sph +20060105_211743_946_fsp.sph +20060105_213129_947_fsp.sph +20060105_213243_948_fsp.sph +20060105_230711_949_fsp.sph +20060106_180202_950_fsp.sph +20060106_181040_951_fsp.sph +20060106_181726_952_fsp.sph +20060106_182909_953_fsp.sph +20060106_183056_954_fsp.sph +20060106_183550_955_fsp.sph +20060106_185224_956_fsp.sph +20060106_193129_957_fsp.sph +20060107_180634_960_fsp.sph +20060107_181553_961_fsp.sph +20060107_182715_962_fsp.sph +20060107_190206_963_fsp.sph +20060107_190415_964_fsp.sph +20060107_210435_966_fsp.sph +20060107_220739_967_fsp.sph +20060108_180630_968_fsp.sph +20060108_194731_971_fsp.sph +20060108_234917_976_fsp.sph +20060109_180448_977_fsp.sph +20060109_182557_979_fsp.sph +20060109_183636_980_fsp.sph +20060109_183727_981_fsp.sph +20060109_205815_982_fsp.sph +20060109_213409_986_fsp.sph +20060109_215138_987_fsp.sph +20060109_220315_988_fsp.sph +20060109_220535_989_fsp.sph +20060110_183405_995_fsp.sph +20060110_200611_998_fsp.sph +20060110_210730_1002_fsp.sph +20060110_213516_1004_fsp.sph +20060110_221920_1006_fsp.sph +20060110_230947_1007_fsp.sph +20060111_181650_1008_fsp.sph +20060111_182557_1009_fsp.sph +20060111_184916_1010_fsp.sph +20060111_192159_1012_fsp.sph +20060111_200345_1013_fsp.sph +20060111_210257_1014_fsp.sph +20060111_212145_1016_fsp.sph 
+20060111_213742_1017_fsp.sph +20060111_213936_1018_fsp.sph +20060111_230912_1020_fsp.sph +20060112_180639_1021_fsp.sph +20060112_182612_1022_fsp.sph +20060112_183346_1023_fsp.sph +20060112_183622_1024_fsp.sph +20060112_210747_1025_fsp.sph +20060112_211025_1026_fsp.sph +20060112_221010_1027_fsp.sph +20060112_221022_1028_fsp.sph +20060113_180159_1030_fsp.sph +20060113_183452_1033_fsp.sph +20060113_190403_1034_fsp.sph +20060113_213733_1036_fsp.sph +20060114_181137_1039_fsp.sph +20060114_181922_1040_fsp.sph +20060114_191056_1043_fsp.sph +20060114_213242_1044_fsp.sph +20060115_180421_1045_fsp.sph +20060115_183525_1047_fsp.sph +20060115_210217_1048_fsp.sph +20060115_212231_1051_fsp.sph +20060115_220504_1052_fsp.sph +20060115_232345_1053_fsp.sph +20060116_181908_1054_fsp.sph +20060116_182500_1055_fsp.sph +20060116_183201_1056_fsp.sph +20060116_184141_1057_fsp.sph +20060116_202324_1058_fsp.sph +20060116_204753_1059_fsp.sph +20060116_210217_1060_fsp.sph +20060116_211237_1061_fsp.sph +20060116_212845_1063_fsp.sph +20060116_220652_1064_fsp.sph +20060116_221118_1065_fsp.sph +20060117_181936_1068_fsp.sph +20060117_182604_1069_fsp.sph +20060117_185153_1071_fsp.sph +20060117_210138_1072_fsp.sph +20060117_210311_1073_fsp.sph +20060117_212546_1074_fsp.sph +20060118_180229_1076_fsp.sph +20060118_180647_1078_fsp.sph +20060118_182448_1079_fsp.sph +20060118_183010_1080_fsp.sph +20060118_190231_1082_fsp.sph +20060118_200148_1083_fsp.sph +20060118_205216_1084_fsp.sph +20060118_212907_1085_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test new file mode 100644 index 00000000000..0cbc3cc95fd --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test @@ -0,0 +1,20 @@ +sp_0053.sph +sp_0082.sph +sp_0084.sph +sp_0088.sph +sp_0681.sph +sp_0699.sph +sp_0776.sph +sp_0857.sph +sp_1031.sph +sp_1100.sph +sp_1148.sph +sp_1156.sph +sp_1186.sph +sp_1212.sph +sp_1345.sph +sp_1435.sph +sp_1578.sph +sp_1648.sph +sp_1807.sph +sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train new file mode 100644 index 00000000000..2c936072534 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train @@ -0,0 +1,80 @@ +sp_0085.sph +sp_0096.sph +sp_0098.sph +sp_0100.sph +sp_0291.sph +sp_0713.sph +sp_0724.sph +sp_0726.sph +sp_0731.sph +sp_0733.sph +sp_0753.sph +sp_0788.sph +sp_0826.sph +sp_0831.sph +sp_0836.sph +sp_0841.sph +sp_0850.sph +sp_0855.sph +sp_0892.sph +sp_0899.sph +sp_0910.sph +sp_0917.sph +sp_0919.sph +sp_0923.sph +sp_0945.sph +sp_0950.sph +sp_0951.sph +sp_0992.sph +sp_0997.sph +sp_1013.sph +sp_1039.sph +sp_1044.sph +sp_1045.sph +sp_1058.sph +sp_1060.sph +sp_1063.sph +sp_1081.sph +sp_1106.sph +sp_1122.sph +sp_1140.sph +sp_1175.sph +sp_1195.sph +sp_1198.sph +sp_1231.sph +sp_1234.sph +sp_1255.sph +sp_1260.sph +sp_1261.sph +sp_1262.sph +sp_1264.sph +sp_1266.sph +sp_1273.sph +sp_1275.sph +sp_1284.sph +sp_1286.sph +sp_1304.sph +sp_1308.sph +sp_1333.sph +sp_1341.sph +sp_1353.sph +sp_1368.sph +sp_1379.sph +sp_1384.sph +sp_1449.sph +sp_1463.sph +sp_1574.sph +sp_1740.sph +sp_1759.sph +sp_1849.sph +sp_1908.sph +sp_1915.sph +sp_1918.sph +sp_1974.sph +sp_1976.sph +sp_1988.sph +sp_2000.sph +sp_2056.sph +sp_2070.sph +sp_2091.sph +sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl new file mode 100755 index 00000000000..03193384670 --- /dev/null +++ 
b/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl @@ -0,0 +1,304 @@ +#!/usr/bin/env perl + +# Oct 21, 2015 : Gaurav Kumar (Johns Hopkins University) +# GNU General Public License, v3.0 +# +# This script was modified under GPL and is being distributed with +# Kaldi. It requires the preference and rule files +# (under LDC copyright) from LDC96L16. The main changes were +# - Outdated usage of perl conventions updated @_ => $_ or @A +# - This script no longer needs the preference and rule files to +# be in the same directory as this script. +# - Accepts tokens from instead of <> + +# --- Retained previous version information ---------------------------- +# spron.pl Version 0.1 Jan. 11 1995 +# Written by Zhibiao Wu, LDC, wzb@unagi.cis.upenn.edu +# This program needs the basic_rules file to run. The rules must be sorted +# in alphabetical order. The most specific rules should precede the more +# general ones. The conventions used in the basic rules are the same as +# regular expressions used in Perl. + +# Revised history: Feb. 10 1995 + +# The file "preferences" (assumed to be in your current directory) +# gives an "oracle" of correct pronunciations that override the +# machine-generated ones. + +# slightly changed 97/09/05 robertm: +# - look for basic_rules and preferences in $PWD instead of ~wzb/... +# - use next to shortcut loop instead of if/else +# - added a bit of documentation, without really trying to decipher this thing +# ----------------------------------------------------------------------- + +use utf8; +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +$vfile = ""; +$preference_file = ""; +$rules_file = ""; +$print_input = 0; +if ($#ARGV < 1) { + # Print Usage + print "Usage : local/spron.pl pref-file rules-file \n"; + exit 1; +} else { + $preference_file = $ARGV[0]; + $rules_file = $ARGV[1]; + if ($#ARGV > 1) { + $vfile = $ARGV[2]; + } + if ($#ARGV > 2) { + $print_input = 1; + } +} + +$rule_num = 0; +$previous = ""; +if ($vfile ne "") { + open(VF, $vfile) || die "Can't find file $vfile!\n"; + while () { + chop; + @A = split(//); + if (($A[0] ne '#') && ($_ ne "")) { + if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { + $head[$rule_num] = $1; + $end[$rule_num] = $2; + $pre[$rule_num] = $3; + if ($4 =~ /#/) { + $nex[$rule_num] = ""; + $some[$rule_num] = $4; + } else { + $nex[$rule_num] = $4; + $some[$rule_num] = $5; + } + if ($previous ne substr($head[$rule_num],0,1)) { + $first{$head[$rule_num]} = $rule_num; + $last{$previous} = $rule_num - 1; + } + $previous = substr($head[$rule_num++],0,1); + } else { + print "Rule format error: Cannot parse $_\n"; + exit(1); + } + } + } + $last{$previous} = $rule_num - 1; + + close(VF); +} + +open(PF, $preference_file) || die "Can't read `preferences' file"; +binmode(PF, ":iso88591"); +while () { + chop; + if ($_ ne "") { + @A = split; + $pron{$A[0]} = $A[1]; + $stre{$A[0]} = $A[2]; + } +} + +$previous = ""; +$brule_num = 0; +open(BF, $rules_file) || die "Can't read `basic_rules' file"; +binmode(BF, ":iso88591"); +while () { + chop; + @A = split(//); + if (($A[0] ne '#') && ($_ ne "")) { + if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { + $bhead[$brule_num] = $1; + $bend[$brule_num] = $2; + $bpre[$brule_num] = $3; + if ($4 =~ /#/) { + $bnex[$brule_num] = ""; + $bsome[$brule_num] = $4; + } else { + $bnex[$brule_num] = $4; + $bsome[$brule_num] = $5; + } + if ($previous ne substr($bhead[$brule_num],0,1)) { + $bfirst{substr($bhead[$brule_num],0,1)} = $brule_num; + $blast{$previous} = $brule_num - 1; + } + $previous = 
substr($bhead[$brule_num++],0,1); + } else { + print "Rule format error in file basic_rules: Cannot parse $_\n"; + exit(1); + } + } +} +$blast{$previous} = $brule_num - 1; +close(BF); + +if ($brule_num == 0) { + print "No basic rules, Program exit!\n"; + exit(1); +} + +while(){ + next if ((/^#/) || (/^\s*$/) ); + chop; + if ($print_input) { + print $_, "\t"; + } + if ($pron{$_}) { + # print answer from preferences and skip to next word + print "$pron{$_}\t$stre{$_}\n"; + next; + } + $original = $_; + tr/A-ZÁÉÍÓÚÏÜÑ/a-záéíóúïüñ/; + $orig = "#" . $_ . "#"; + + @l = (); + + push(@l,split("",$orig)); + + @pron = &transfer(1); + + foreach (@pron) { + $a = $_; + y/aeiouáéíóú//cd; + if ($_ eq "") { + print "#No stressable vowel in $original\n"; + } else { + s/[aeiou]/0/go; + s/[áéíóú]/1/go; + if (!/1/) { + if(length() == 1){ + s/\b./1/o; + } elsif($l[$#l - 1] =~ /[aeiouns]/o){ + s/00\b/10/o; + } else { + s/0\b/1/o; + } + } + + $a =~ s/á/a/g; + $a =~ s/é/e/g; + $a =~ s/í/i/g; + $a =~ s/ó/o/g; + $a =~ s/ú/u/g; + + print "$a\t$_\n"; + } + } +} + +sub transfer{ + local($_) = @_; + local(@p) = (); + local($s) = 0; + local($over) = 0; + local($i,$j,$k) = (0,0,0); + + if ($_ >= length($orig) - 1) { + push(@p, ""); + return(@p); + } else { + + if ($vfile ne "") { + for ($i= $first{substr($orig, $_, 1)}; + $i <= $last{substr($orig, $_, 1)} ; $i++) { + if (&matchv($_,$i)) { + $s = $_ + length($head[$i]); + foreach $w (&transfer($s)) { + push(@p, $end[$i] . $w); + if ($some[$i] ne "") { + $over = 0; + } else { + $over = 1; + } + } + } + } + } + + if ($over == 0 ) { + $i = $bfirst{substr($orig, $_, 1)}; + while (($i <= $blast{substr($orig, $_, 1)}) && ($over == 0)) { + if (&matchb($_,$i)) { + $over = 1; + $s = $_ + length($bhead[$i]); + foreach $w (&transfer($s)) { + push(@p, $bend[$i] . $w); + } + } + $i++; + } + if ($over == 0) { + $s = $_ + 1; + foreach $w (&transfer($s)) { + push(@p, substr($orig,$_,1) . $w); + } + } + } + + return(@p); + } +} + +sub matchv { + $h = $head[$_[1]]; + $p = $pre[$_[1]]; + $n = $nex[$_[1]]; + + return(&match($_[0],$h,$p,$n)); + +} + +sub matchb { + $h = $bhead[$_[1]]; + $p = $bpre[$_[1]]; + $n = $bnex[$_[1]]; + + return(&match($_[0],$h,$p,$n)); + +} + +sub match { + + if (substr($orig, $_[0], length($_[1])) eq $_[1]) { + return ( &match_n($_[0] + length($_[1]) - 1, $_[3]) && + &match_p($_[0], $_[2])); + } else { + return (0); + } +} + +sub match_p { + local($a) = $_[0]; + local($b) = $_[1]; + local($_); + + if ($b eq "" ) { + return (1); + } else { + $_ = substr($orig, 0, $a) . "!"; + if (/($b)!/) { + return(1); + } else { + return(0); + } + } +} + +sub match_n { + local($a) = $_[0]; + local($b) = $_[1]; + local($_); + + if ($b eq "" ) { + return (1); + } else { + $_ = "!" . substr($orig, $a + 1, length($orig) - $a - 1); + if (/!($b)/) { + return(1); + } else { + return(0); + } + } +} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh new file mode 100755 index 00000000000..9f5855d56c4 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# The input is a subset of the dataset in use. (*.sph files) +# In addition the transcripts are needed as well. +# This script is only called internally and should not be +# used for any other purpose. A similar script for general usage +# is local/fsp_data_prep.sh +# To be run from one directory above this script. 
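+#
+# A usage sketch with illustrative argument values (the script only checks
+# that at least four arguments are supplied; see ../run.sh for how it is
+# actually invoked):
+#   local/subset_data_prep.sh /path/to/LDC2010S01 /path/to/LDC2010T04 \
+#     dev local/splits/split_fisher/dev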
+ +stage=0 + +export LC_ALL=C + + +if [ $# -lt 4 ]; then + echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories and the name of this partition +, and a list of files that belong to this partition . see ../run.sh for example." + exit 1; +fi + +subset=$3 +dir=`pwd`/data/local/$subset/data +mkdir -p $dir +local=`pwd`/local +utils=`pwd`/utils +tmpdir=`pwd`/data/local/tmp +mkdir -p $tmpdir + +. ./path.sh || exit 1; # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi +cd $dir + +# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command +# line arguments being absolute pathnames. +rm -r links/ 2>/dev/null +mkdir links/ +mkdir links/speech +mkdir links/transcripts +if [ ! -f $4 ]; then + echo "Please specify a valid parition file. Could not find $4" + exit 1; +fi +cat $4 | sed 's:.*/::g' | \ +xargs -I % find $1/ -name %* | xargs -I % echo cp % links/ + +# Basic spot checks to see if we got the data that we needed +if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; +then + echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti +vely" + exit 1; +fi + +if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; +then + echo "Disc 1 and 2 directories missing or not properly organised within the speech data dir" + echo "Typical format is LDC2010S01/DISC?/data/speech" + exit 1; +fi + +#Check the transcripts directories as well to see if they exist +if [ ! -d links/LDC2010T04/data/transcripts ]; +then + echo "Transcript directories missing or not properly organised" + echo "Typical format is LDC2010T04/data/transcripts" + exit 1; +fi + +speech_d1=$dir/links/LDC2010S01/DISC1/data/speech +speech_d2=$dir/links/LDC2010S01/DISC2/data/speech +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts +#Now check if we got all the files that we needed +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi + +if [ $stage -le 0 ]; then + #Gather all the speech files together to create a file list + #TODO: Train and test split might be required + ( + find $speech_d1 -iname '*.sph'; + find $speech_d2 -iname '*.sph'; + ) > $tmpdir/train_sph.flist + + #Get all the transcripts in one place + find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist +fi + +if [ $stage -le 1 ]; then + $local/fsp_make_trans.pl $tmpdir + mkdir -p $dir/train_all + mv $tmpdir/reco2file_and_channel $dir/train_all/ +fi + +if [ $stage -le 2 ]; then + sort $tmpdir/text.1 | grep -v '((' | \ + awk '{if (NF > 1){ print; }}' | \ + sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ + sed 's:\([^<]*\)<\/lname>:\1:g' | \ + sed 's:::g' | \ + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed 's:<\s*cough[\/]*>:[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's:[^<]*<\/background>:[noise]:g' | \ + sed -r 's:<[/]?background[/]?>:[noise]:g' | \ + #One 
more time to take care of nested stuff + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ + #now handle the exceptions, find a cleaner way to do this? + sed 's:::g' | \ + sed 's:::g' | \ + sed 's:foreign>::g' | \ + sed 's:>::g' | \ + #How do you handle numbers? + grep -v '()' | \ + #Now go after the non-printable characters + sed -r 's:¿::g' > $tmpdir/text.2 + cp $tmpdir/text.2 $dir/train_all/text + + #Create segments file and utt2spk file + ! cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ + && echo "Error producing utt2spk file" && exit 1; + + cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; + $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/train_all/segments + + $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt +fi + +if [ $stage -le 3 ]; then + cat $tmpdir/train_sph.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp + cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ + sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; +fi + +if [ $stage -le 4 ]; then + # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. + cat $tmpdir/spk2gendertmp | sort | uniq > $dir/train_all/spk2gender +fi + +echo "Fisher Spanish Data preparation succeeded." + +exit 1; + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py new file mode 100755 index 00000000000..ce83fa8c8aa --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +import os +import sys + +files = [ +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/scoring/13.tra')] + +def findTranscription(timeDetail): + + for file1 in files: + file1.seek(0,0) + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + + +wordsFile = open('exp/tri5a/graph/words.txt') +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? +# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists('exp/tri5a/one-best/train'): + os.makedirs('exp/tri5a/one-best/train') + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + newFile.close() +provFile.close() + + + + + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py new file mode 100755 index 00000000000..b9f906b27da --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +from __future__ import print_function +import os +import sys +import subprocess + +latticeLocation = {1:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/latjosh-2/lattices-pushed/", +2:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/latjosh-2/lattices-pushed/", +3:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/latjosh-2/lattices-pushed/", +4:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/latjosh-2/lattices-pushed/", +5:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/latjosh-2/lattices-pushed/", +6:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/latjosh-2/lattices-pushed/", +7:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/latjosh-2/lattices-pushed/", +8:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/latjosh-2/lattices-pushed/", +9:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/latjosh-2/lattices-pushed/", +10:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/latjosh-2/lattices-pushed/"} + +latticeDict = {} + +for key,location in latticeLocation.items(): + for root, dirs, filenames in os.walk(location): + for f in filenames: + latticeDict[f] = str(key) + +tmpdir = 'data/local/data/tmp/lattmp' +if not os.path.exists(tmpdir): + os.makedirs(tmpdir) +invalidplfdir = 'data/local/data/tmp/invalidplf' +if not os.path.exists(invalidplfdir): + os.makedirs(invalidplfdir) +else: + os.system("rm " + invalidplfdir + "/*") + +def latticeConcatenate(lat1, lat2): + ''' + Concatenates lattices, writes temporary results to tmpdir + ''' + if lat1 == "": + if os.path.exists('rm ' + tmpdir + '/tmp.lat'): + os.system('rm ' + tmpdir + '/tmp.lat') + return lat2 + else: + proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) + proc.wait() + return tmpdir + '/tmp.lat' + + +def findLattice(timeDetail): + ''' + Finds the lattice corresponding to a time segment + ''' + searchKey = timeDetail + '.lat' + if searchKey in latticeDict: + return "/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-" + latticeDict[searchKey] + "/latjosh-2/lattices-pushed/" + searchKey + else: + return -1 + + +# Now read list of files in conversations +fileList = [] +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
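+# The loop below walks the conversations of this split: for each line of a
+# conversation's timing file it concatenates the per-utterance lattices with
+# fstconcat, removes epsilons and topologically sorts the result, converts it
+# to a PLF via fsm2plf.sh, and records blank or malformed PLFs so that the
+# corresponding lines can be removed later. The absolute /export/... paths
+# are specific to the original author's setup and need to be adapted.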
+# Now get timing information to concatenate the ASR outputs + +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.plf', 'w+') +lineNo = 1 +invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/invalidPLF', 'w+') +blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/blankPLF', 'w+') +rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/removeLines', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + for line in timingFile: + timeInfo = line.split() + + # For utterances that are concatenated in the translation file, + # the corresponding FSTs have to be translated as well + mergedTranslation = "" + for timeDetail in timeInfo: + tmp = findLattice(timeDetail) + if tmp != -1: + # Concatenate lattices + mergedTranslation = latticeConcatenate(mergedTranslation, tmp) + + if mergedTranslation != "": + + # Sanjeev's Recipe : Remove epsilons and topo sort + finalFST = tmpdir + "/final.fst" + os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) + + # Now convert to PLF + proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh /export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt ' + finalFST, stdout=subprocess.PIPE, shell=True) + PLFline = proc.stdout.readline() + finalPLFFile = tmpdir + "/final.plf" + finalPLF = open(finalPLFFile, "w+") + finalPLF.write(PLFline) + finalPLF.close() + + # now check if this is a valid PLF, if not write it's ID in a + # file so it can be checked later + proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) + line = proc.stdout.readline() + print("{} {}".format(line, lineNo)) + if line.strip() != "PLF format appears to be correct.": + os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) + invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + else: + provFile.write(PLFline) + else: + blankPLF.write(timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + # Now convert to PLF + lineNo += 1 + +provFile.close() +invalidPLF.close() +blankPLF.close() +rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh new file mode 100755 index 00000000000..29fbeebace6 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +stage=-2 +num_words_pocolm=110000 +prune_size=1000000 + +. ./path_venv.sh +. ./cmd.sh +. 
./utils/parse_options.sh + +set -euo pipefail + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + +textdir=$1 +pocolm_dir=$2 + + +if [ $stage -le -2 ];then + if [ -e "$textdir"/unigram_weights ]; then + rm "$textdir"/unigram_weights + fi + + if [ -e "$pocolm_dir" ]; then + rm -r "$pocolm_dir" + fi + + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" +fi + +if [ $stage -le -1 ];then + prune_lm_dir.py --target-num-ngrams=${prune_size} --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned + format_arpa_lm.py --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned | gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3.pocolm_pruned_${prune_size}.arpa.gz +fi + + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py new file mode 100755 index 00000000000..3f6444da294 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +import os +import sys + +files = [ +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/oracle/oracle.tra')] + +def findTranscription(timeDetail): + + for file1 in files: + file1.seek(0,0) + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + + +wordsFile = open('exp/tri5a/graph/words.txt') +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
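+# (This script mirrors train_get_1_best.py, except that it reads the lattice
+# oracle transcripts, oracle/oracle.tra, rather than the one-best hypotheses
+# in scoring/13.tra, and writes asr.train.oracle.)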
+# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists('exp/tri5a/one-best/train'): + os.makedirs('exp/tri5a/one-best/train') + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.oracle', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + newFile.close() +provFile.close() + + + + + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter b/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter new file mode 100755 index 00000000000..4fce42945b3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter @@ -0,0 +1,5 @@ +#!/bin/sed -f +s:\[laughter\]::g +s:\[noise\]::g +s:\[oov\]::g +s:::g diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh new file mode 100755 index 00000000000..2fc3de37406 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -0,0 +1,13 @@ +export KALDI_ROOT=`pwd`/../../../../kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LD_LIBRARY_PATH=/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh b/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh new file mode 100755 index 00000000000..80edbbaf69a --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh @@ -0,0 +1,13 @@ +export KALDI_ROOT=`pwd`/../../../../kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LD_LIBRARY_PATH=/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/rnnlm b/egs/fisher_callhome_spanish/s5_gigaword/rnnlm new file mode 120000 index 00000000000..fb754622d5e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/rnnlm @@ -0,0 +1 @@ +../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh new file mode 100755 index 00000000000..5f7068072f3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -0,0 +1,299 @@ +#!/bin/bash +# +# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Recipe for Fisher/Callhome-Spanish + +stage=-1 +lmstage=-2 +train_sgmm2=false + +# call the next line with the directory where the Spanish Fisher data is +# (the values below are just an example). +sfisher_speech=/export/corpora/LDC/LDC2010S01 +sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +spanish_lexicon=/export/corpora/LDC/LDC96L16 +split=local/splits/split_fisher + +callhome_speech=/export/corpora/LDC/LDC96S35 +callhome_transcripts=/export/corpora/LDC/LDC96T17 +split_callhome=local/splits/split_callhome + +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data # Path to the download of Gigaword data +rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm # Work path for entire Gigaword LM and text processing, should be + # large free spae and easy IO access. +mfccdir=`pwd`/mfcc + +. ./cmd.sh +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +set -eou pipefail + +if [ $stage -le -1 ]; then + local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + local/callhome_data_prep.sh $callhome_speech $callhome_transcripts + + # The lexicon is created using the LDC spanish lexicon, the words from the + # fisher spanish corpus. Additional (most frequent) words are added from the + # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted + # wordlist is downloaded if it is not available. + local/fsp_prepare_dict.sh $spanish_lexicon + + # Added c,j, v to the non silences phones manually + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. Depending on your needs, you might have to change the size of + # the splits within that file. 
The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + + utils/fix_data_dir.sh data/local/data/train_all + + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1; + + utils/fix_data_dir.sh data/local/data/train_all + utils/validate_data_dir.sh data/local/data/train_all + + cp -r data/local/data/train_all data/train_all + + # For the CALLHOME corpus + utils/fix_data_dir.sh data/local/data/callhome_train_all + + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1; + + utils/fix_data_dir.sh data/local/data/callhome_train_all + utils/validate_data_dir.sh data/local/data/callhome_train_all + + cp -r data/local/data/callhome_train_all data/callhome_train_all + + # Creating data partitions for the pipeline + # We need datasets for both the ASR and SMT system + # We have 257455 utterances left, so the partitions are roughly as follows + # ASR Train : 100k utterances + # ASR Tune : 17455 utterances + # ASR Eval : 20k utterances + # MT Train : 100k utterances + # MT Tune : Same as the ASR eval set (Use the lattices from here) + # MT Eval : 20k utterances + # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker + # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. + # As noted above, the LM has not been trained on the dev and the test sets. + #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test + #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test + #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test + #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev + #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test + #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train + #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test + #rm -r data/dev_and_test + #rm -r data/asr_dev_and_test + #rm -r data/mt_train_and_test + + local/create_splits.sh $split + local/callhome_create_splits.sh $split_callhome +fi + +if [ $stage -le 0 ]; then + mkdir -p "$rnnlm_workdir"/gigaword_rawtext + local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 + cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt + local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ + "$rnnlm_workdir"/normalised_gigaword_corpus/ + mkdir -p "$rnnlm_workdir"/text_lm + cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt + cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. 
+ cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt +fi + + +if [ $stage -le 1 ]; then + num_words_pocolm=110000 + local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + cat "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt > "$rnnlm_workdir"/rnnlm_wordlist.txt + cut -f 1 -d " " data/lang/words.txt >> "$rnnlm_workdir"/rnnlm_wordlist.txt + cat "$rnnlm_workdir"/rnnlm_wordlist.txt | sort | uniq > "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/100000_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text "$rnnlm_workdir"/text_lm --text-dir "$rnnlm_workdir"/text_lm +fi + +if [ $stage -le 2 ]; then + # Now compute CMVN stats for the train, dev and test subsets + steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir + steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir + steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir + #steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir + #steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir + + #n=$[`cat data/train_all/segments | wc -l` - 158126] + #utils/subset_data_dir.sh --last data/train_all $n data/train + steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir + + steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir + steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir + steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir + + # Again from Dan's recipe : Reduced monophone training data + # Now-- there are 1.6 million utterances, and we want to start the monophone training + # on relatively short utterances (easier to align), but not only the very shortest + # ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random + # utterances from those. 
+ + utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort + utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k + utils/data/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup + utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k + utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +fi + +if [ $stage -le 3 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_10k_nodup data/lang exp/mono0a + + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; + + + (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri1/graph data/dev exp/tri1/decode_dev)& + + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; + + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + )& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; + +# Train tri3a, which is LDA+MLLT, on 100k data. + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + )& +fi + +if [ $stage -le 5 ]; then +# Next we'll use fMLLR and train with SAT (i.e. 
on +# fMLLR features) + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev +)& + + + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; + +# Reduce the number of gaussians + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test + + # Decode CALLHOME + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train + ) & + + + steps/align_fmllr.sh \ + --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ + data/train data/lang exp/tri5a exp/tri5a_ali +fi + +if $train_sgmm2; then + +steps/train_ubm.sh \ + --cmd "$train_cmd" 750 \ + data/train data/lang exp/tri5a_ali exp/ubm5 + +steps/train_sgmm2.sh \ + --cmd "$train_cmd" 5000 18000 \ + data/train data/lang exp/tri5a_ali exp/ubm5/final.ubm exp/sgmm5 + +utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph + +( + steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ + --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ + exp/sgmm5/graph data/dev exp/sgmm5/decode_dev +)& + +steps/align_sgmm2.sh \ + --nj 32 --cmd "$train_cmd" --transform-dir exp/tri5a_ali \ + --use-graphs true --use-gselect true \ + data/train data/lang exp/sgmm5 exp/sgmm5_ali + +steps/make_denlats_sgmm2.sh \ + --nj 32 --sub-split 32 --num-threads 4 \ + --beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri5a_ali \ + data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats + +steps/train_mmi_sgmm2.sh \ + --cmd "$train_cmd" --drop-frames true --transform-dir exp/tri5a_ali --boost 0.1 \ + data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \ + exp/sgmm5_mmi_b0.1 + +( +utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph +steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 12"\ + exp/tri5a/graph data/dev exp/tri5a/decode_dev +utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph +steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ + --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ + exp/sgmm5/graph data/dev exp/sgmm5/decode_dev +for iter in 1 2 3 4; do + decode=exp/sgmm5_mmi_b0.1/decode_dev_it$iter + mkdir -p $decode + steps/decode_sgmm2_rescore.sh \ + --cmd "$decode_cmd" --iter $iter --transform-dir 
exp/tri5a/decode_dev \ + data/lang_test data/dev/ exp/sgmm5/decode_dev $decode +done +) & +fi + +wait; + +if [ $stage -le 6 ]; then + local/chain/run_tdnn_1g.sh || exit 1; +fi +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/steps b/egs/fisher_callhome_spanish/s5_gigaword/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/utils b/egs/fisher_callhome_spanish/s5_gigaword/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file From e8aecbb584d05eb0b4cad22d3d57a59b0a20a8d5 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Tue, 19 Feb 2019 10:47:15 +0530 Subject: [PATCH 02/49] Some bug fixes --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 5f7068072f3..89e8fbd434b 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -118,8 +118,8 @@ if [ $stage -le 1 ]; then cat "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt > "$rnnlm_workdir"/rnnlm_wordlist.txt cut -f 1 -d " " data/lang/words.txt >> "$rnnlm_workdir"/rnnlm_wordlist.txt cat "$rnnlm_workdir"/rnnlm_wordlist.txt | sort | uniq > "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/100000_3.pocolm \ - --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text "$rnnlm_workdir"/text_lm --text-dir "$rnnlm_workdir"/text_lm + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text-dir "$rnnlm_workdir"/text_lm fi if [ $stage -le 2 ]; then From ece34bd064bfbdcae7b655552057469c5d47b0b2 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Tue, 19 Feb 2019 10:48:44 +0530 Subject: [PATCH 03/49] Update rnnlm.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh index aa06fdbb293..3850910f312 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh @@ -21,7 +21,6 @@ lstm_rpd=256 lstm_nrpd=256 stage=0 train_stage=-30 -text=Spanish_gigawrd/text_lm text_dir=Spanish_gigawrd/text_lm . ./cmd.sh @@ -30,7 +29,7 @@ text_dir=Spanish_gigawrd/text_lm mkdir -p $dir/config set -e -for f in $text/dev.txt; do +for f in $text_dir/dev.txt; do [ ! 
-f $f ] && \ echo "$0: expected file $f to exist;" && exit 1 done From 0c4fe470684751a54e4def8600dde847b8507cd5 Mon Sep 17 00:00:00 2001 From: saikiran valluri Date: Tue, 19 Feb 2019 01:27:47 -0500 Subject: [PATCH 04/49] Combining lexicon words with pocolm wordslist for RNNLM training --- .../s5_gigaword/local/get_rnnlm_wordlist.py | 32 ++++++++++++++ .../s5_gigaword/local/rnnlm.sh | 3 +- .../s5_gigaword/run.sh | 42 ++++--------------- 3 files changed, 42 insertions(+), 35 deletions(-) create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py new file mode 100755 index 00000000000..d6ddfbecc14 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 4: + print( "Usage: python get_rnnlm_wordlist.py ") + sys.exit() + +lexicon_words = open(sys.argv[1], 'r') +pocolm_words = open(sys.argv[2], 'r') +rnnlm_wordsout = open(sys.argv[3], 'w') + +line_count=0 +lexicon=[] + +for line in lexicon_words: + lexicon.append(line.split()[0]) + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +for line in pocolm_words: + if not line.split()[0] in lexicon: + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +lexicon_words.close() +pocolm_words.close() +rnnlm_wordsout.close() + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh index aa06fdbb293..3850910f312 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh @@ -21,7 +21,6 @@ lstm_rpd=256 lstm_nrpd=256 stage=0 train_stage=-30 -text=Spanish_gigawrd/text_lm text_dir=Spanish_gigawrd/text_lm . ./cmd.sh @@ -30,7 +29,7 @@ text_dir=Spanish_gigawrd/text_lm mkdir -p $dir/config set -e -for f in $text/dev.txt; do +for f in $text_dir/dev.txt; do [ ! -f $f ] && \ echo "$0: expected file $f to exist;" && exit 1 done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 5f7068072f3..80c0debfb12 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -19,9 +19,8 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data # Path to the download of Gigaword data -rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm # Work path for entire Gigaword LM and text processing, should be - # large free spae and easy IO access. +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data +rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm mfccdir=`pwd`/mfcc . ./cmd.sh @@ -31,8 +30,9 @@ if [ -f path.sh ]; then . ./path.sh; fi set -eou pipefail if [ $stage -le -1 ]; then - local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts - local/callhome_data_prep.sh $callhome_speech $callhome_transcripts +# local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + +# local/callhome_data_prep.sh $callhome_speech $callhome_transcripts # The lexicon is created using the LDC spanish lexicon, the words from the # fisher spanish corpus. 
Additional (most frequent) words are added from the @@ -72,29 +72,6 @@ if [ $stage -le -1 ]; then cp -r data/local/data/callhome_train_all data/callhome_train_all - # Creating data partitions for the pipeline - # We need datasets for both the ASR and SMT system - # We have 257455 utterances left, so the partitions are roughly as follows - # ASR Train : 100k utterances - # ASR Tune : 17455 utterances - # ASR Eval : 20k utterances - # MT Train : 100k utterances - # MT Tune : Same as the ASR eval set (Use the lattices from here) - # MT Eval : 20k utterances - # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker - # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. - # As noted above, the LM has not been trained on the dev and the test sets. - #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test - #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test - #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test - #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev - #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test - #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train - #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test - #rm -r data/dev_and_test - #rm -r data/asr_dev_and_test - #rm -r data/mt_train_and_test - local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome fi @@ -115,11 +92,10 @@ fi if [ $stage -le 1 ]; then num_words_pocolm=110000 local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - cat "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt > "$rnnlm_workdir"/rnnlm_wordlist.txt - cut -f 1 -d " " data/lang/words.txt >> "$rnnlm_workdir"/rnnlm_wordlist.txt - cat "$rnnlm_workdir"/rnnlm_wordlist.txt | sort | uniq > "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/100000_3.pocolm \ - --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text "$rnnlm_workdir"/text_lm --text-dir "$rnnlm_workdir"/text_lm + local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm fi if [ $stage -le 2 ]; then From 1439b0dd9d0d2ae527e0ddd14c6a4b39c7bd7075 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sun, 24 Feb 2019 01:54:54 -0500 Subject: [PATCH 05/49] Integrated the 2 stage scientific method POCOLM training for Gigaword corpus --- .../s5_gigaword/cmd.sh | 2 +- .../local/get_unigram_weights_vocab.py | 33 +++++++++++++++++++ .../s5_gigaword/local/pocolm_cust.sh | 7 ++-- .../s5_gigaword/local/train_pocolm.sh | 26 +++++++++++---- .../s5_gigaword/run.sh | 5 ++- 5 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py diff --git a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh index 0511bd2bbb0..db97f1fbc6f 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh +++ 
b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="retry.pl queue.pl" +export train_cmd="retry.pl queue.pl --mem 8G" export decode_cmd="retry.pl queue.pl --mem 8G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py new file mode 100644 index 00000000000..43cf8392167 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 3: + print("Usage : python . ") + print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") + sys.exit() + +pocolmdir=sys.argv[1] +unigramwts=open(sys.argv[2], 'w') + +names = open(pocolmdir+"/names", 'r') +metaparams = open(pocolmdir+"/metaparameters", 'r') + +name_mapper={} +for line in names: + fields=line.split() + name_mapper[fields[0]] = fields[1] + +lns = metaparams.readlines() +for lineno in range(len(name_mapper.keys())): + line = lns[lineno] + fileid = line.split()[0].split("_")[-1] + weight = line.split()[1] + unigramwts.write(name_mapper[fileid] + " " + weight + "\n") + +names.close() +unigramwts.close() +metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh index a3b2d77d860..c6642f6fcf4 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh @@ -13,6 +13,8 @@ export PATH=$PATH:$POCOLM_ROOT/scripts wordlist=None num_word=100000 +pocolm_stage=2 +ngram_order=3 lm_dir= arpa_dir= textdir= @@ -55,7 +57,7 @@ limit_unk_history_opt= # un-comment the following line #limit_unk_history_opt="--limit-unk-history=true" -for order in 3; do +for order in ${ngram_order}; do # decide on the vocabulary. # Note: you'd use --wordlist if you had a previously determined word-list # that you wanted to use. @@ -72,6 +74,7 @@ for order in 3; do --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + if [ $pocolm_stage -eq 2 ];then mkdir -p ${arpa_dir} format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz @@ -93,7 +96,7 @@ for order in 3; do get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz - + fi done # (run local/srilm_baseline.sh ${num_word} to see the following result e.g. 
local/srilm_baseline.sh 40000 ) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index 29fbeebace6..8ceb08f281a 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -17,22 +17,34 @@ textdir=$1 pocolm_dir=$2 -if [ $stage -le -2 ];then +if [ $stage -le -2 ]; then + echo "\n\n" + echo " POCOLM experiment : Runnning STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." + echo "\n\n" if [ -e "$textdir"/unigram_weights ]; then rm "$textdir"/unigram_weights fi - if [ -e "$pocolm_dir" ]; then rm -r "$pocolm_dir" fi + + bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ - --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" fi - + if [ $stage -le -1 ];then - prune_lm_dir.py --target-num-ngrams=${prune_size} --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned - format_arpa_lm.py --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned | gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3.pocolm_pruned_${prune_size}.arpa.gz + echo "\n\n" + echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." + echo "\n\n" + + echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done + python local/get_unigramwts.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + + fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 80c0debfb12..6e2ee9d4f25 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -20,7 +20,7 @@ callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm +rnnlm_workdir=/export/c03/svalluri/workdir_pocolm_2stage mfccdir=`pwd`/mfcc . 
./cmd.sh @@ -94,6 +94,9 @@ if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ "$rnnlm_workdir"/rnnlm_wordlist +fi + +if [ $stage -le 2 ]; then local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm fi From 8ad0e0130c011fef22f583d1ca60e0c0d6f856a0 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 26 Feb 2019 05:17:14 +0000 Subject: [PATCH 06/49] Update train_pocolm.sh --- .../s5_gigaword/local/train_pocolm.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index 8ceb08f281a..c8adb79383e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -18,10 +18,10 @@ pocolm_dir=$2 if [ $stage -le -2 ]; then - echo "\n\n" - echo " POCOLM experiment : Runnning STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo "****" + echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." - echo "\n\n" + echo "****" if [ -e "$textdir"/unigram_weights ]; then rm "$textdir"/unigram_weights fi @@ -35,12 +35,12 @@ if [ $stage -le -2 ]; then fi if [ $stage -le -1 ];then - echo "\n\n" + echo "********" echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." 
- echo "\n\n" + echo "********" echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done - python local/get_unigramwts.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" From f856ac2c4cd0da3c7df4aab65a1eace387dd60b7 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Wed, 27 Feb 2019 15:36:32 +0530 Subject: [PATCH 07/49] Update run.sh --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 6e2ee9d4f25..bd553fc720e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -88,10 +88,9 @@ if [ $stage -le 0 ]; then cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi - +num_words_pocolm=110000 if [ $stage -le 1 ]; then - num_words_pocolm=110000 - local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ "$rnnlm_workdir"/rnnlm_wordlist fi From 684f029e77da3426c59e3b4106ce6b45160de088 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 28 Feb 2019 11:57:29 +0000 Subject: [PATCH 08/49] Text cleaning script for splitting Abbreviation words added --- .../s5_gigaword/local/clean_abbrevs_text.py | 33 +++++++++++++++++++ .../s5_gigaword/local/run_norm.sh | 3 ++ scripts/rnnlm/choose_features.py | 12 ++----- scripts/rnnlm/get_best_model.py | 28 ++++++++-------- scripts/rnnlm/get_embedding_dim.py | 4 +-- scripts/rnnlm/get_num_splits.sh | 2 +- scripts/rnnlm/get_special_symbol_opts.py | 8 ++--- scripts/rnnlm/get_unigram_probs.py | 18 ++++------ scripts/rnnlm/get_vocab.py | 11 +++---- scripts/rnnlm/get_word_features.py | 15 ++++----- scripts/rnnlm/lmrescore.sh | 6 ---- scripts/rnnlm/lmrescore_nbest.sh | 4 +-- scripts/rnnlm/lmrescore_pruned.sh | 17 +++------- scripts/rnnlm/prepare_rnnlm_dir.sh | 9 ++--- scripts/rnnlm/prepare_split_data.py | 13 +++----- scripts/rnnlm/rnnlm_cleanup.py | 2 +- scripts/rnnlm/show_word_features.py | 19 +++-------- scripts/rnnlm/train_rnnlm.sh | 2 +- scripts/rnnlm/validate_features.py | 7 ++-- scripts/rnnlm/validate_text_dir.py | 11 +++---- scripts/rnnlm/validate_word_features.py | 11 +++---- 21 files changed, 104 insertions(+), 131 deletions(-) create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py new file mode 100644 index 00000000000..22fc54f18cc --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., + +import os, sys +import re +import codecs + +if len(sys.argv) < 3: + print("Usage : python clean_abbrevs_text.py ") + print(" Processes the text before text normalisation to 
convert uppercase words as space separated letters") + sys.exit() + +inputfile=codecs.open(sys.argv[1], encoding='utf-8') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w+') + +for line in inputfile: + words = line.split() + textout = "" + wordcnt = 0 + for word in words: + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word) and wordcnt>0: + print(word) + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " + wordcnt = wordcnt + 1 + outputfile.write(textout.strip()+ '\n') + +inputfile.close() +outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh index 4a26f6857b8..f88fecc815c 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh @@ -24,7 +24,10 @@ for i in "${punctuation_symbols[@]}"; do num_syms=$((num_syms+1)) done mkdir -p $dir/normalize/$job +local/clean_abbrevs_text.py $data/$job $data/"$job"_processed +mv $data/"$job"_processed $data/$job echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh + bash $dir/normalize/$job/substitute.sh | \ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index c6621e04494..799f6b6dcc8 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -10,12 +10,6 @@ from collections import defaultdict sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) -# because this script splits inside words, we cannot use latin-1; we actually need to know what -# what the encoding is. By default we make this utf-8; to handle encodings that are not compatible -# with utf-8 (e.g. gbk), we'll eventually have to make the encoding an option to this script. - -import re -tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " @@ -90,9 +84,9 @@ # and 'wordlist' is a list indexed by integer id, that returns the string-valued word. def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8") as f: + with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -121,7 +115,7 @@ def read_unigram_probs(unigram_probs_file): unigram_probs = [] with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py index 333ed8dbfc7..45487b18b0c 100755 --- a/scripts/rnnlm/get_best_model.py +++ b/scripts/rnnlm/get_best_model.py @@ -3,14 +3,14 @@ # Copyright 2017 Johns Hopkins University (author: Daniel Povey) # License: Apache 2.0. 
+import os import argparse -import glob -import re import sys +import re parser = argparse.ArgumentParser(description="Works out the best iteration of RNNLM training " - "based on dev-set perplexity, and prints the number corresponding " - "to that iteration", + "based on dev-set perplexity, and prints the number corresponding " + "to that iteration", epilog="E.g. " + sys.argv[0] + " exp/rnnlm_a", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -19,9 +19,10 @@ args = parser.parse_args() -num_iters = None + +num_iters=None try: - with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f: + with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f: for line in f: a = line.split("=") if a[0] == "num_iters": @@ -35,15 +36,15 @@ sys.exit(sys.argv[0] + ": could not get num_iters from {0}/info.txt".format( args.rnnlm_dir)) -best_objf = -2000 -best_iter = -1 -for i in range(1, num_iters): +best_objf=-2000 +best_iter=-1 +for i in range(num_iters): this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i) try: - f = open(this_logfile, 'r', encoding='latin-1') + f = open(this_logfile, 'r', encoding='utf-8') except: sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile)) - this_objf = -1000 + this_objf=-1000 for line in f: m = re.search('Overall objf .* (\S+)$', str(line)) if m is not None: @@ -52,10 +53,6 @@ except Exception as e: sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format( this_logfile, line, str(e))) - # verify this iteration still has model files present - if len(glob.glob("{0}/{1}.raw".format(args.rnnlm_dir, i))) == 0: - # this iteration has log files, but model files have been cleaned up, skip it - continue if this_objf == -1000: print(sys.argv[0] + ": warning: could not parse objective function from {0}".format( this_logfile), file=sys.stderr) @@ -66,4 +63,5 @@ if best_iter == -1: sys.exit(sys.argv[0] + ": error: could not get best iteration.") + print(str(best_iter)) diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py index 63eaf307498..b6810ef2cbf 100755 --- a/scripts/rnnlm/get_embedding_dim.py +++ b/scripts/rnnlm/get_embedding_dim.py @@ -45,7 +45,7 @@ left_context=0 right_context=0 for line in out_lines: - line = line.decode('latin-1') + line = line.decode('utf-8') m = re.search(r'input-node name=input dim=(\d+)', line) if m is not None: try: @@ -101,4 +101,4 @@ "nnet '{0}': {1} != {2}".format( args.nnet, input_dim, output_dim)) -print('{}'.format(input_dim)) +print(str(input_dim)) diff --git a/scripts/rnnlm/get_num_splits.sh b/scripts/rnnlm/get_num_splits.sh index 974fd8bf204..93d1f7f169c 100755 --- a/scripts/rnnlm/get_num_splits.sh +++ b/scripts/rnnlm/get_num_splits.sh @@ -65,7 +65,7 @@ tot_with_multiplicities=0 for f in $text/*.counts; do if [ "$f" != "$text/dev.counts" ]; then - this_tot=$(cat $f | awk '{tot += $2} END{printf("%d", tot)}') + this_tot=$(cat $f | awk '{tot += $2} END{print tot}') if ! 
[ $this_tot -gt 0 ]; then echo "$0: there were no counts in counts file $f" 1>&2 exit 1 diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 4310b116ad7..13fe497faf9 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -8,9 +8,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " "print out the options with correct value to stdout, which may look like " @@ -28,10 +25,9 @@ lower_ids = {} upper_ids = {} -input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') +input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace') for line in input_stream: - fields = re.split(tab_or_space, line) - assert(len(fields) == 2) + fields = line.split() sym = fields[0] if sym in special_symbols: assert sym not in lower_ids diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index ab3f9bb382f..32b01728ca3 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. " + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " "--data-weights-file=exp/rnnlm/data_weights.txt data/rnnlm/data " @@ -77,10 +74,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="latin-1") as f: + with open(weights_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: try: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -102,9 +99,9 @@ def read_data_weights(weights_file, data_sources): # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="latin-1") as f: + with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -131,11 +128,10 @@ def get_counts(data_sources, data_weights, vocab): if weight == 0.0: continue - with open(counts_file, 'r', encoding="latin-1") as f: + with open(counts_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) - if len(fields) != 2: print("Warning, should be 2 cols:", fields, line, file=sys.stderr); - assert(len(fields) == 2) + fields = line.split() + assert len(fields) == 2 word = fields[0] count = fields[1] if word not in vocab: diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index 1502e915f9c..f290ef721c1 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -6,10 +6,7 @@ import os import argparse import sys -sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) - -import re -tab_or_space = re.compile('[ \t]+') +sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", @@ -28,10 +25,10 @@ # Add the count for every word in counts_file # the result is written into word_counts def add_counts(word_counts, counts_file): - with open(counts_file, 'r', encoding="latin-1") as f: + with open(counts_file, 'r', encoding="utf-8") as f: for line in f: - line = line.strip(" \t\r\n") - word_and_count = re.split(tab_or_space, line) + line = line.strip() + word_and_count = line.split() assert len(word_and_count) == 2 if word_and_count[0] in word_counts: word_counts[word_and_count[0]] += int(word_and_count[1]) diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index aeb7a3ec6ae..8bdb553b9c8 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -9,9 +9,6 @@ import math from collections import defaultdict -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " --unigram-probs=exp/rnnlm/unigram_probs.txt " @@ -41,9 +38,9 @@ # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="latin-1") as f: + with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -62,9 +59,9 @@ def read_vocab(vocab_file): # return a list of unigram_probs, indexed by word id def read_unigram_probs(unigram_probs_file): unigram_probs = [] - with open(unigram_probs_file, 'r', encoding="latin-1") as f: + with open(unigram_probs_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): @@ -103,9 +100,9 @@ def read_features(features_file): feats['min_ngram_order'] = 10000 feats['max_ngram_order'] = -1 - with open(features_file, 'r', encoding="latin-1") as f: + with open(features_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) diff --git a/scripts/rnnlm/lmrescore.sh b/scripts/rnnlm/lmrescore.sh index 9da22ae75a2..cd0cf793d8d 100755 --- a/scripts/rnnlm/lmrescore.sh +++ b/scripts/rnnlm/lmrescore.sh @@ -72,12 +72,6 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; -if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then - # the last word of the RNNLM word list is an added word - echo "$0: Word lists mismatch for lattices and RNNLM." - exit 1 -fi - oldlm_command="fstproject --project_output=true $oldlm |" special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt) diff --git a/scripts/rnnlm/lmrescore_nbest.sh b/scripts/rnnlm/lmrescore_nbest.sh index 58b19b9fa79..f50a3c909f0 100755 --- a/scripts/rnnlm/lmrescore_nbest.sh +++ b/scripts/rnnlm/lmrescore_nbest.sh @@ -29,7 +29,7 @@ if [ $# != 6 ]; then echo "This version applies an RNNLM and mixes it with the LM scores" echo "previously in the lattices., controlled by the first parameter (rnnlm-weight)" echo "" - echo "Usage: $0 [options] " + echo "Usage: utils/rnnlmrescore.sh " echo "Main options:" echo " --inv-acwt # default 12. e.g. --inv-acwt 17. Equivalent to LM scale to use." echo " # for N-best list generation... note, we'll score at different acwt's" @@ -177,7 +177,7 @@ fi if [ $stage -le 6 ]; then echo "$0: invoking rnnlm/compute_sentence_scores.sh which calls rnnlm to get RNN LM scores." $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ - rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ + local/rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ $adir.JOB/words_text $adir.JOB/lmwt.rnn fi if [ $stage -le 7 ]; then diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 9ba78415708..46ee5846424 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -16,18 +16,16 @@ max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram- # the same ngram history and this prevents the lattice from # exploding exponentially. Details of the n-gram approximation # method are described in section 2.3 of the paper - # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf -max_arcs= # limit the max arcs in lattice while rescoring. 
E.g., 20000 + # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdm +max_arcs=499 # limit the max arcs in lattice while rescoring. E.g., 20000 -acwt=0.1 -weight=0.5 # Interpolation weight for RNNLM. +acwt=1 +weight=1 # Interpolation weight for RNNLM. normalize=false # If true, we add a normalization step to the output of the RNNLM # so that it adds up to *exactly* 1. Note that this is not necessary # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf -lattice_prune_beam=4 # Beam used in pruned lattice composition - # This option affects speed and how large the composed lattice may be # End configuration section. @@ -75,12 +73,6 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; -if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then - # the last word of the RNNLM word list is an added word - echo "$0: Word lists mismatch for lattices and RNNLM." - exit 1 -fi - normalize_opt= if $normalize; then normalize_opt="--normalize-probs=true" @@ -105,7 +97,6 @@ cp $indir/num_jobs $outdir $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore-kaldi-rnnlm-pruned --lm-scale=$weight $special_symbol_opts \ - --lattice-compose-beam=$lattice_prune_beam \ --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order $normalize_opt $max_arcs_opt \ $carpa_option $oldlm $word_embedding "$rnnlm_dir/final.raw" \ "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; diff --git a/scripts/rnnlm/prepare_rnnlm_dir.sh b/scripts/rnnlm/prepare_rnnlm_dir.sh index e101822d983..d3ee44f1f95 100755 --- a/scripts/rnnlm/prepare_rnnlm_dir.sh +++ b/scripts/rnnlm/prepare_rnnlm_dir.sh @@ -23,7 +23,7 @@ if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "Sets up the directory for RNNLM training as done by" echo "rnnlm/train_rnnlm.sh, and initializes the model." - echo " is as validated by rnnlm/validate_text_dir.py" + echo " is as validated by rnnlm/validate_data_dir.py" echo " is as validated by rnnlm/validate_config_dir.sh." exit 1 fi @@ -34,7 +34,6 @@ config_dir=$2 dir=$3 set -e -. ./path.sh if [ $stage -le 0 ]; then echo "$0: validating input" @@ -53,13 +52,9 @@ if [ $stage -le 1 ]; then echo "$0: copying config directory" mkdir -p $dir/config # copy expected things from $config_dir to $dir/config. - for f in words.txt data_weights.txt oov.txt xconfig; do + for f in words.txt features.txt data_weights.txt oov.txt xconfig; do cp $config_dir/$f $dir/config done - # features.txt is optional, check separately - if [ -f $config_dir/features.txt ]; then - cp $config_dir/features.txt $dir/config - fi fi rnnlm/get_special_symbol_opts.py < $dir/config/words.txt > $dir/special_symbol_opts.txt diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index cceac48313e..9cc4f69d09f 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -8,9 +8,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " @@ -66,10 +63,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="latin-1") as f: + with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -97,7 +94,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): num_outputs = len(output_filehandles) n = 0 try: - f = open(source_filename, 'r', encoding="latin-1") + f = open(source_filename, 'r', encoding="utf-8") except Exception as e: sys.exit(sys.argv[0] + ": failed to open file {0} for reading: {1} ".format( source_filename, str(e))) @@ -124,7 +121,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): os.makedirs(args.split_dir + "/info") # set up the 'num_splits' file, which contains an integer. -with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="latin-1") as f: +with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="utf-8") as f: print(args.num_splits, file=f) # e.g. set temp_files = [ 'foo/1.tmp', 'foo/2.tmp', ..., 'foo/5.tmp' ] @@ -136,7 +133,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): temp_filehandles = [] for fname in temp_files: try: - temp_filehandles.append(open(fname, 'w', encoding="latin-1")) + temp_filehandles.append(open(fname, 'w', encoding="utf-8")) except Exception as e: sys.exit(sys.argv[0] + ": failed to open file: " + str(e) + ".. if this is a max-open-filehandles limitation, you may " diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py index 40cbee7a496..6a304f7f4cb 100644 --- a/scripts/rnnlm/rnnlm_cleanup.py +++ b/scripts/rnnlm/rnnlm_cleanup.py @@ -69,7 +69,7 @@ def get_compute_prob_info(log_file): compute_prob_done = False # roughly based on code in get_best_model.py try: - f = open(log_file, "r", encoding="latin-1") + f = open(log_file, "r", encoding="utf-8") except: print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping", diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 89b134adaf9..89d84d53f3e 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -6,16 +6,7 @@ import os import argparse import sys - -# The use of latin-1 encoding does not preclude reading utf-8. latin-1 encoding -# means "treat words as sequences of bytes", and it is compatible with utf-8 -# encoding as well as other encodings such as gbk, as long as the spaces are -# also spaces in ascii (which we check). It is basically how we emulate the -# behavior of python before python3. -sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) - -import re -tab_or_space = re.compile('[ \t]+') +sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. 
" + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " @@ -36,9 +27,9 @@ def read_feature_type_and_key(features_file): feat_types = {} - with open(features_file, 'r', encoding="latin-1") as f: + with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [2, 3, 4]) feat_id = int(fields[0]) @@ -53,9 +44,9 @@ def read_feature_type_and_key(features_file): feat_type_and_key = read_feature_type_and_key(args.features_file) num_word_feats = 0 -with open(args.word_features_file, 'r', encoding="latin-1") as f: +with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) % 2 == 1 print(int(fields[0]), end='\t') diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh index 013e9a56c2f..f056d096120 100755 --- a/scripts/rnnlm/train_rnnlm.sh +++ b/scripts/rnnlm/train_rnnlm.sh @@ -41,7 +41,7 @@ use_gpu_for_diagnostics=false # set true to use GPU for compute_prob_*.log # optional cleanup options cleanup=false # add option --cleanup true to enable automatic cleanup of old models cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best" -cleanup_keep_iters=3 # number of iterations that will have their models retained +cleanup_keep_iters=100 # number of iterations that will have their models retained trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM . utils/parse_options.sh diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index 2a077da4758..a650092b086 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " exp/rnnlm/features.txt", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -24,7 +21,7 @@ if not os.path.isfile(args.features_file): sys.exit(sys.argv[0] + ": Expected file {0} to exist".format(args.features_file)) -with open(args.features_file, 'r', encoding="latin-1") as f: +with open(args.features_file, 'r', encoding="utf-8") as f: has_unigram = False has_length = False idx = 0 @@ -33,7 +30,7 @@ final_feats = {} word_feats = {} for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) assert idx == int(fields[0]) diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index 903e720bdf4..d644d77911e 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", epilog="E.g. 
" + sys.argv[0] + " data/rnnlm/data", @@ -40,7 +37,7 @@ def check_text_file(text_file): - with open(text_file, 'r', encoding="latin-1") as f: + with open(text_file, 'r', encoding="utf-8") as f: found_nonempty_line = False lineno = 0 if args.allow_internal_eos == 'true': @@ -54,7 +51,7 @@ def check_text_file(text_file): lineno += 1 if args.spot_check == 'true' and lineno > 10: break - words = re.split(tab_or_space, line) + words = line.split() if len(words) != 0: found_nonempty_line = True for word in words: @@ -76,9 +73,9 @@ def check_text_file(text_file): # with some kind of utterance-id first_field_set = set() other_fields_set = set() - with open(text_file, 'r', encoding="latin-1") as f: + with open(text_file, 'r', encoding="utf-8") as f: for line in f: - array = re.split(tab_or_space, line) + array = line.split() if len(array) > 0: first_word = array[0] if first_word in first_field_set or first_word in other_fields_set: diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index 205b934ae1b..3dc9b23aa41 100755 --- a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " "exp/rnnlm/word_feats.txt", @@ -28,9 +25,9 @@ unigram_feat_id = -1 length_feat_id = -1 max_feat_id = -1 -with open(args.features_file, 'r', encoding="latin-1") as f: +with open(args.features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) @@ -52,9 +49,9 @@ if feat_id > max_feat_id: max_feat_id = feat_id -with open(args.word_features_file, 'r', encoding="latin-1") as f: +with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) > 0 and len(fields) % 2 == 1 word_id = int(fields[0]) From 185da3aa1afd4b5dda886607a504b83394e8a13f Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:17:53 +0530 Subject: [PATCH 09/49] Update clean_txt_dir.sh --- .../s5_gigaword/local/clean_txt_dir.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh index 56891328a89..0f06c037080 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh @@ -6,7 +6,7 @@ stage=0 nj=500 -. ./path.sh +. ./path_venv.sh . ./cmd.sh . 
./utils/parse_options.sh @@ -38,7 +38,7 @@ if [ $stage -le 0 ]; then $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ local/run_norm.sh \ sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ $outdir/data \ JOB \ $outdir/sparrowhawk/ From cb393c81f678b704aa14de2b0d304ce4191a1026 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:22:12 +0530 Subject: [PATCH 10/49] Update clean_txt_dir.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh index 0f06c037080..60269c0ab7e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh @@ -6,7 +6,7 @@ stage=0 nj=500 -. ./path_venv.sh +. ./path.sh . ./cmd.sh . ./utils/parse_options.sh From 18a9cb6fe0927fbda13311e0bb4399c3e495e9e2 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:23:25 +0530 Subject: [PATCH 11/49] Update train_pocolm.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index c8adb79383e..964dd3bbcc5 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -4,7 +4,7 @@ stage=-2 num_words_pocolm=110000 prune_size=1000000 -. ./path_venv.sh +. ./path.sh . ./cmd.sh . ./utils/parse_options.sh From b023638357122da580ea41a8230b4e7ee2b5c69f Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:23:55 +0530 Subject: [PATCH 12/49] Update pocolm_cust.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh index c6642f6fcf4..422db15937a 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh @@ -5,7 +5,7 @@ set -euo pipefail -. ./path_venv.sh +. 
./path.sh export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) export PATH=$PATH:$POCOLM_ROOT/scripts From 46550f0c598d50df636e5c181611566a7b211085 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 28 Feb 2019 13:04:51 +0000 Subject: [PATCH 13/49] Cosmetic fixes --- .../s5_gigaword/local/clean_abbrevs_text.py | 7 ++++--- .../s5_gigaword/local/get_unigram_weights_vocab.py | 2 +- egs/fisher_callhome_spanish/s5_gigaword/path.sh | 2 +- .../s5_gigaword/path_venv.sh | 13 ------------- 4 files changed, 6 insertions(+), 18 deletions(-) delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py index 22fc54f18cc..e5dfcd07a1c 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # 2018 Saikiran Valluri, GoVivace inc., @@ -13,7 +13,7 @@ sys.exit() inputfile=codecs.open(sys.argv[1], encoding='utf-8') -outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w+') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') for line in inputfile: words = line.split() @@ -26,7 +26,8 @@ textout = textout + " ".join(word) + " " else: textout = textout + word + " " - wordcnt = wordcnt + 1 + if word.isalpha(): + wordcnt = wordcnt + 1 outputfile.write(textout.strip()+ '\n') inputfile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py index 43cf8392167..3ecd16772d7 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # 2018 Saikiran Valluri, GoVivace inc. diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index 2fc3de37406..80edbbaf69a 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -10,4 +10,4 @@ export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C.UTF-8 export LANG=C.UTF-8 - +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh b/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh deleted file mode 100755 index 80edbbaf69a..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh +++ /dev/null @@ -1,13 +0,0 @@ -export KALDI_ROOT=`pwd`/../../../../kaldi -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH -[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 -. 
$KALDI_ROOT/tools/config/common_path.sh -export LD_LIBRARY_PATH=/home/dpovey/libs - -export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk -export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C.UTF-8 -export LANG=C.UTF-8 - -source ~/anaconda/bin/activate py36 From ce3c7d7a2169113fb6bb7fd0b395250f4f123c12 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:41:36 +0530 Subject: [PATCH 14/49] Update path.sh --- egs/fisher_callhome_spanish/s5_gigaword/path.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index 80edbbaf69a..d2c2937d81e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -9,5 +9,3 @@ export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C.UTF-8 export LANG=C.UTF-8 - -source ~/anaconda/bin/activate py36 From deeaaa76ce6a89fd500a917f0793eaab93d63356 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Fri, 1 Mar 2019 07:22:40 -0500 Subject: [PATCH 15/49] Bug fix in text normalisation script for gigaword corpus --- .../s5_gigaword/local/clean_abbrevs_text.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py index e5dfcd07a1c..a6edc0f92c5 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -20,14 +20,14 @@ textout = "" wordcnt = 0 for word in words: - if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word) and wordcnt>0: - print(word) - word = re.sub('\'?s', 's', word) - textout = textout + " ".join(word) + " " + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): + if wordcnt > 0: + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " else: - textout = textout + word + " " - if word.isalpha(): - wordcnt = wordcnt + 1 + if word.isalpha(): wordcnt = wordcnt + 1 outputfile.write(textout.strip()+ '\n') inputfile.close() From 633f21d33a53228ca870821ce6a2e5a432c4e9f6 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 1 Mar 2019 20:26:37 +0530 Subject: [PATCH 16/49] small Fix path.sh --- egs/fisher_callhome_spanish/s5_gigaword/path.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index d2c2937d81e..e622e7d5051 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -1,4 +1,4 @@ -export KALDI_ROOT=`pwd`/../../../../kaldi +export KALDI_ROOT=`pwd`/../../../ [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" 
&& exit 1 From 8d6b14d1f75c9f532ab945e1328c8d925cf21064 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 1 Mar 2019 21:17:29 +0530 Subject: [PATCH 17/49] Update clean_abbrevs_text.py --- .../s5_gigaword/local/clean_abbrevs_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py index a6edc0f92c5..7d92eb9fe3a 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -27,6 +27,7 @@ else: textout = textout + word + " " else: + textout = textout + word + " " if word.isalpha(): wordcnt = wordcnt + 1 outputfile.write(textout.strip()+ '\n') From 8c9c37bad8eba62d20231dda0d34553a6ce12c1b Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Fri, 1 Mar 2019 15:54:00 +0000 Subject: [PATCH 18/49] Added sparrowhawk installation script for text normalisation --- tools/install_sparrowhawk.sh | 73 ++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 tools/install_sparrowhawk.sh diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh new file mode 100755 index 00000000000..f9bbcb1b28e --- /dev/null +++ b/tools/install_sparrowhawk.sh @@ -0,0 +1,73 @@ +#!/bin/bash +export LDFLAGS="-L`pwd`/openfst/lib" +export CXXFLAGS="-I`pwd`/openfst/include" +stage=0 + +if [ $stage -le 0 ] ; then + git clone -b feature/Spanish_normalizer https://github.com/spokencloud/sparrowhawk-resources.git || exit 1; + patch -p0 < sparrowhawk-resources/local/Makefile.patch || exit 1; + make openfst || exit 1; + git clone https://github.com/mjansche/thrax.git + export LDFLAGS=-L`pwd`/openfst/lib + export CXXFLAGS=-I`pwd`/openfst/include + cd thrax + autoreconf --force --install || exit 1; + ./configure --prefix=`pwd` || exit 1; + make || exit 1; + make install || exit 1; + cd .. + git clone https://github.com/google/re2.git || exit 1; + cd re2/ + make -j 20 || exit 1; + make test || exit 1; + make install prefix=`pwd` || exit 1; + cd .. + git clone https://github.com/google/protobuf.git || exit 1; + cd protobuf/ + ./autogen.sh || exit 1; + ./configure --prefix=`pwd` || exit 1; + make -j 20 || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 1 ]; then + git clone https://github.com/google/sparrowhawk.git || exit 1; + patch -p0 < sparrowhawk-resources/local/sparrowhawk.patch || exit 1; + cd sparrowhawk/ || exit 1; + mkdir lib + mkdir bin + mkdir include + cp -r ../openfst/lib/* lib/ || exit 1; + cp -r ../protobuf/lib/* lib/ || exit 1; + cp -r ../re2/lib/* lib/ || exit 1; + cp -r ../thrax/lib/* lib/ || exit 1; + cp -r ../openfst/include/* include/ || exit 1; + cp -r ../protobuf/include/* include/ || exit 1; + cp -r ../re2/include/* include/ || exit 1; + cp -r ../thrax/include/* include/ || exit 1; + cp ../protobuf/bin/protoc bin/. || exit 1; + export PATH=`pwd`/bin:$PATH + aclocal || exit 1; + automake || exit 1; + ./configure --prefix=`pwd` CPPFLAGS="-I`pwd`/include" LDFLAGS="-L`pwd`/lib" || exit 1; + make || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 2 ]; then + source ~/anaconda/bin/activate py27 || exit 1; + cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; + cd sparrowhawk/language-resources/en/textnorm/classifier || exit 1; + . 
./path.sh || exit 1; + python create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far + thraxmakedep tokenize_and_classify.grm || exit 1; + make || exit 1; + cd ../verbalizer + python create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far + cp -r ../classifier/universal_depot.far . + thraxmakedep verbalize.grm || exit 1; + make || exit 1; + cd ../../../../.. +fi From c6b05d18597612170148a3ad7b313dc192d62de4 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sat, 2 Mar 2019 06:02:57 +0000 Subject: [PATCH 19/49] G2P training stage added into Spanish gigaword recipe --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 8 ++++++++ tools/extras/install_g2p_seq2seq.sh | 5 +++++ tools/install_g2p_seq2seq.sh | 1 + 3 files changed, 14 insertions(+) create mode 100644 tools/extras/install_g2p_seq2seq.sh create mode 120000 tools/install_g2p_seq2seq.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index bd553fc720e..7e488cdc5fa 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -39,6 +39,9 @@ if [ $stage -le -1 ]; then # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -74,6 +77,11 @@ if [ $stage -le -1 ]; then local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi fi if [ $stage -le 0 ]; then diff --git a/tools/extras/install_g2p_seq2seq.sh b/tools/extras/install_g2p_seq2seq.sh new file mode 100644 index 00000000000..c9979b8b961 --- /dev/null +++ b/tools/extras/install_g2p_seq2seq.sh @@ -0,0 +1,5 @@ +if [ ! -e g2p-seq2seq ];then + git clone https://github.com/cmusphinx/g2p-seq2seq.git + cd g2p-seq2seq/ + python setup.py install +fi diff --git a/tools/install_g2p_seq2seq.sh b/tools/install_g2p_seq2seq.sh new file mode 120000 index 00000000000..77715305f74 --- /dev/null +++ b/tools/install_g2p_seq2seq.sh @@ -0,0 +1 @@ +extras/install_g2p_seq2seq.sh \ No newline at end of file From 8c226cc9b0995c9a656a20484587d46ed28e5fee Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sat, 2 Mar 2019 06:06:28 +0000 Subject: [PATCH 20/49] G2P seq2seq scripts added in steps/ --- egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh | 42 ++++++++++++++++++++++ egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh | 39 ++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh create mode 100644 egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh diff --git a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh new file mode 100644 index 00000000000..77a08c305dd --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script applies a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line, with lines like 'hello h uh l ow') and" + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 +outdir=$3 + +mkdir -p $outdir + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ] ; then + echo "g2p-seq2seq was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --decode $lexicon --model_dir $wdir --output $outdir/lexicon.lex + diff --git a/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh new file mode 100644 index 00000000000..e0389171fd5 --- /dev/null +++ b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script trains a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line, with lines like 'hello h uh l ow') and" + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ]; then + echo "g2p-seq2seq was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --max_epochs 12 --train $lexicon --model_dir $wdir + From 7b67fc2ade32fa7449a3a228903c920f499a2c3c Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sat, 2 Mar 2019 12:09:40 +0000 Subject: [PATCH 21/49] RNNLM scripts updated to UTF8 encoding --- scripts/rnnlm/choose_features.py | 12 +++++++++--- scripts/rnnlm/get_best_model.py | 24 +++++++++++++----------- scripts/rnnlm/get_embedding_dim.py | 2 +- scripts/rnnlm/get_num_splits.sh | 2 +- scripts/rnnlm/get_special_symbol_opts.py | 8 ++++++-- scripts/rnnlm/get_unigram_probs.py | 18 +++++++++++------- scripts/rnnlm/get_vocab.py | 7 +++++-- scripts/rnnlm/get_word_features.py | 15 +++++++++------ scripts/rnnlm/lmrescore.sh | 6 ++++++ scripts/rnnlm/lmrescore_nbest.sh | 4 ++-- scripts/rnnlm/lmrescore_pruned.sh | 17 +++++++++++++---- scripts/rnnlm/prepare_rnnlm_dir.sh | 9 +++++++-- scripts/rnnlm/prepare_split_data.py | 5 ++++- scripts/rnnlm/show_word_features.py | 13 +++++++++++-- scripts/rnnlm/train_rnnlm.sh | 2 +- scripts/rnnlm/validate_features.py | 5 ++++- scripts/rnnlm/validate_text_dir.py | 7 +++++-- scripts/rnnlm/validate_word_features.py | 7 +++++-- 18 files changed, 113 insertions(+), 50 deletions(-) diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index 799f6b6dcc8..c6621e04494 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -10,6 +10,12 @@ from collections import defaultdict sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +# because this script splits inside words, we cannot use latin-1; we actually need to know what +# what the encoding is. By default we make this utf-8; to handle encodings that are not compatible +# with utf-8 (e.g. gbk), we'll eventually have to make the encoding an option to this script. 
+ +import re +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " @@ -84,9 +90,9 @@ # and 'wordlist' is a list indexed by integer id, that returns the string-valued word. def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -115,7 +121,7 @@ def read_unigram_probs(unigram_probs_file): unigram_probs = [] with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py index 45487b18b0c..ed266346e06 100755 --- a/scripts/rnnlm/get_best_model.py +++ b/scripts/rnnlm/get_best_model.py @@ -3,14 +3,14 @@ # Copyright 2017 Johns Hopkins University (author: Daniel Povey) # License: Apache 2.0. -import os import argparse -import sys +import glob import re +import sys parser = argparse.ArgumentParser(description="Works out the best iteration of RNNLM training " - "based on dev-set perplexity, and prints the number corresponding " - "to that iteration", + "based on dev-set perplexity, and prints the number corresponding " + "to that iteration", epilog="E.g. " + sys.argv[0] + " exp/rnnlm_a", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -19,8 +19,7 @@ args = parser.parse_args() - -num_iters=None +num_iters = None try: with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f: for line in f: @@ -36,15 +35,15 @@ sys.exit(sys.argv[0] + ": could not get num_iters from {0}/info.txt".format( args.rnnlm_dir)) -best_objf=-2000 -best_iter=-1 -for i in range(num_iters): +best_objf = -2000 +best_iter = -1 +for i in range(1, num_iters): this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i) try: f = open(this_logfile, 'r', encoding='utf-8') except: sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile)) - this_objf=-1000 + this_objf = -1000 for line in f: m = re.search('Overall objf .* (\S+)$', str(line)) if m is not None: @@ -53,6 +52,10 @@ except Exception as e: sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format( this_logfile, line, str(e))) + # verify this iteration still has model files present + if len(glob.glob("{0}/{1}.raw".format(args.rnnlm_dir, i))) == 0: + # this iteration has log files, but model files have been cleaned up, skip it + continue if this_objf == -1000: print(sys.argv[0] + ": warning: could not parse objective function from {0}".format( this_logfile), file=sys.stderr) @@ -63,5 +66,4 @@ if best_iter == -1: sys.exit(sys.argv[0] + ": error: could not get best iteration.") - print(str(best_iter)) diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py index b6810ef2cbf..1d516e0edf5 100755 --- a/scripts/rnnlm/get_embedding_dim.py +++ b/scripts/rnnlm/get_embedding_dim.py @@ -101,4 +101,4 @@ "nnet '{0}': {1} != {2}".format( args.nnet, input_dim, output_dim)) -print(str(input_dim)) +print('{}'.format(input_dim)) diff --git a/scripts/rnnlm/get_num_splits.sh b/scripts/rnnlm/get_num_splits.sh index 
93d1f7f169c..974fd8bf204 100755 --- a/scripts/rnnlm/get_num_splits.sh +++ b/scripts/rnnlm/get_num_splits.sh @@ -65,7 +65,7 @@ tot_with_multiplicities=0 for f in $text/*.counts; do if [ "$f" != "$text/dev.counts" ]; then - this_tot=$(cat $f | awk '{tot += $2} END{print tot}') + this_tot=$(cat $f | awk '{tot += $2} END{printf("%d", tot)}') if ! [ $this_tot -gt 0 ]; then echo "$0: there were no counts in counts file $f" 1>&2 exit 1 diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 13fe497faf9..0cf8e10feca 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -8,6 +8,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " "print out the options with correct value to stdout, which may look like " @@ -25,9 +28,10 @@ lower_ids = {} upper_ids = {} -input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace') +input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') for line in input_stream: - fields = line.split() + fields = re.split(tab_or_space, line) + assert(len(fields) == 2) sym = fields[0] if sym in special_symbols: assert sym not in lower_ids diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index 32b01728ca3..d115b6f54bf 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. " + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " "--data-weights-file=exp/rnnlm/data_weights.txt data/rnnlm/data " @@ -74,10 +77,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="utf-8", errors='replace') as f: + with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -99,9 +102,9 @@ def read_data_weights(weights_file, data_sources): # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -128,10 +131,11 @@ def get_counts(data_sources, data_weights, vocab): if weight == 0.0: continue - with open(counts_file, 'r', encoding="utf-8", errors='replace') as f: + with open(counts_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() - assert len(fields) == 2 + fields = re.split(tab_or_space, line) + if len(fields) != 2: print("Warning, should be 2 cols:", fields, line, file=sys.stderr); + assert(len(fields) == 2) word = fields[0] count = fields[1] if word not in vocab: diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index f290ef721c1..d65f8e3669b 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -8,6 +8,9 @@ import sys sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", epilog="E.g. " + sys.argv[0] + " data/rnnlm/data > data/rnnlm/vocab/words.txt", @@ -27,8 +30,8 @@ def add_counts(word_counts, counts_file): with open(counts_file, 'r', encoding="utf-8") as f: for line in f: - line = line.strip() - word_and_count = line.split() + line = line.strip(" \t\r\n") + word_and_count = re.split(tab_or_space, line) assert len(word_and_count) == 2 if word_and_count[0] in word_counts: word_counts[word_and_count[0]] += int(word_and_count[1]) diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index 8bdb553b9c8..7555b774b83 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -9,6 +9,9 @@ import math from collections import defaultdict +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " --unigram-probs=exp/rnnlm/unigram_probs.txt " @@ -38,9 +41,9 @@ # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -59,9 +62,9 @@ def read_vocab(vocab_file): # return a list of unigram_probs, indexed by word id def read_unigram_probs(unigram_probs_file): unigram_probs = [] - with open(unigram_probs_file, 'r', encoding="utf-8", errors='replace') as f: + with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): @@ -100,9 +103,9 @@ def read_features(features_file): feats['min_ngram_order'] = 10000 feats['max_ngram_order'] = -1 - with open(features_file, 'r', encoding="utf-8", errors='replace') as f: + with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) diff --git a/scripts/rnnlm/lmrescore.sh b/scripts/rnnlm/lmrescore.sh index cd0cf793d8d..9da22ae75a2 100755 --- a/scripts/rnnlm/lmrescore.sh +++ b/scripts/rnnlm/lmrescore.sh @@ -72,6 +72,12 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; +if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then + # the last word of the RNNLM word list is an added word + echo "$0: Word lists mismatch for lattices and RNNLM." + exit 1 +fi + oldlm_command="fstproject --project_output=true $oldlm |" special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt) diff --git a/scripts/rnnlm/lmrescore_nbest.sh b/scripts/rnnlm/lmrescore_nbest.sh index f50a3c909f0..58b19b9fa79 100755 --- a/scripts/rnnlm/lmrescore_nbest.sh +++ b/scripts/rnnlm/lmrescore_nbest.sh @@ -29,7 +29,7 @@ if [ $# != 6 ]; then echo "This version applies an RNNLM and mixes it with the LM scores" echo "previously in the lattices., controlled by the first parameter (rnnlm-weight)" echo "" - echo "Usage: utils/rnnlmrescore.sh " + echo "Usage: $0 [options] " echo "Main options:" echo " --inv-acwt # default 12. e.g. --inv-acwt 17. Equivalent to LM scale to use." echo " # for N-best list generation... note, we'll score at different acwt's" @@ -177,7 +177,7 @@ fi if [ $stage -le 6 ]; then echo "$0: invoking rnnlm/compute_sentence_scores.sh which calls rnnlm to get RNN LM scores." $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ - local/rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ + rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ $adir.JOB/words_text $adir.JOB/lmwt.rnn fi if [ $stage -le 7 ]; then diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 46ee5846424..9ba78415708 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -16,16 +16,18 @@ max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram- # the same ngram history and this prevents the lattice from # exploding exponentially. Details of the n-gram approximation # method are described in section 2.3 of the paper - # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdm -max_arcs=499 # limit the max arcs in lattice while rescoring. 
E.g., 20000 + # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf +max_arcs= # limit the max arcs in lattice while rescoring. E.g., 20000 -acwt=1 -weight=1 # Interpolation weight for RNNLM. +acwt=0.1 +weight=0.5 # Interpolation weight for RNNLM. normalize=false # If true, we add a normalization step to the output of the RNNLM # so that it adds up to *exactly* 1. Note that this is not necessary # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf +lattice_prune_beam=4 # Beam used in pruned lattice composition + # This option affects speed and how large the composed lattice may be # End configuration section. @@ -73,6 +75,12 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; +if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then + # the last word of the RNNLM word list is an added word + echo "$0: Word lists mismatch for lattices and RNNLM." + exit 1 +fi + normalize_opt= if $normalize; then normalize_opt="--normalize-probs=true" @@ -97,6 +105,7 @@ cp $indir/num_jobs $outdir $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore-kaldi-rnnlm-pruned --lm-scale=$weight $special_symbol_opts \ + --lattice-compose-beam=$lattice_prune_beam \ --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order $normalize_opt $max_arcs_opt \ $carpa_option $oldlm $word_embedding "$rnnlm_dir/final.raw" \ "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; diff --git a/scripts/rnnlm/prepare_rnnlm_dir.sh b/scripts/rnnlm/prepare_rnnlm_dir.sh index d3ee44f1f95..e101822d983 100755 --- a/scripts/rnnlm/prepare_rnnlm_dir.sh +++ b/scripts/rnnlm/prepare_rnnlm_dir.sh @@ -23,7 +23,7 @@ if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "Sets up the directory for RNNLM training as done by" echo "rnnlm/train_rnnlm.sh, and initializes the model." - echo " is as validated by rnnlm/validate_data_dir.py" + echo " is as validated by rnnlm/validate_text_dir.py" echo " is as validated by rnnlm/validate_config_dir.sh." exit 1 fi @@ -34,6 +34,7 @@ config_dir=$2 dir=$3 set -e +. ./path.sh if [ $stage -le 0 ]; then echo "$0: validating input" @@ -52,9 +53,13 @@ if [ $stage -le 1 ]; then echo "$0: copying config directory" mkdir -p $dir/config # copy expected things from $config_dir to $dir/config. - for f in words.txt features.txt data_weights.txt oov.txt xconfig; do + for f in words.txt data_weights.txt oov.txt xconfig; do cp $config_dir/$f $dir/config done + # features.txt is optional, check separately + if [ -f $config_dir/features.txt ]; then + cp $config_dir/features.txt $dir/config + fi fi rnnlm/get_special_symbol_opts.py < $dir/config/words.txt > $dir/special_symbol_opts.txt diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index 9cc4f69d09f..adcb164771d 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -8,6 +8,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " @@ -66,7 +69,7 @@ def read_data_weights(weights_file, data_sources): with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 89d84d53f3e..8b69fbb7d8a 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -6,8 +6,17 @@ import os import argparse import sys + +# The use of latin-1 encoding does not preclude reading utf-8. latin-1 encoding +# means "treat words as sequences of bytes", and it is compatible with utf-8 +# encoding as well as other encodings such as gbk, as long as the spaces are +# also spaces in ascii (which we check). It is basically how we emulate the +# behavior of python before python3. sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. " + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " "> exp/rnnlm/word_feats.str.txt", @@ -29,7 +38,7 @@ def read_feature_type_and_key(features_file): with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [2, 3, 4]) feat_id = int(fields[0]) @@ -46,7 +55,7 @@ def read_feature_type_and_key(features_file): num_word_feats = 0 with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) % 2 == 1 print(int(fields[0]), end='\t') diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh index f056d096120..013e9a56c2f 100755 --- a/scripts/rnnlm/train_rnnlm.sh +++ b/scripts/rnnlm/train_rnnlm.sh @@ -41,7 +41,7 @@ use_gpu_for_diagnostics=false # set true to use GPU for compute_prob_*.log # optional cleanup options cleanup=false # add option --cleanup true to enable automatic cleanup of old models cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best" -cleanup_keep_iters=100 # number of iterations that will have their models retained +cleanup_keep_iters=3 # number of iterations that will have their models retained trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM . utils/parse_options.sh diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index a650092b086..939e634592c 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. 
" + sys.argv[0] + " exp/rnnlm/features.txt", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -30,7 +33,7 @@ final_feats = {} word_feats = {} for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) assert idx == int(fields[0]) diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index d644d77911e..61914e4836a 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", epilog="E.g. " + sys.argv[0] + " data/rnnlm/data", @@ -51,7 +54,7 @@ def check_text_file(text_file): lineno += 1 if args.spot_check == 'true' and lineno > 10: break - words = line.split() + words = re.split(tab_or_space, line) if len(words) != 0: found_nonempty_line = True for word in words: @@ -75,7 +78,7 @@ def check_text_file(text_file): other_fields_set = set() with open(text_file, 'r', encoding="utf-8") as f: for line in f: - array = line.split() + array = re.split(tab_or_space, line) if len(array) > 0: first_word = array[0] if first_word in first_field_set or first_word in other_fields_set: diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index 3dc9b23aa41..303daf28bb1 100755 --- a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. 
" + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " "exp/rnnlm/word_feats.txt", @@ -27,7 +30,7 @@ max_feat_id = -1 with open(args.features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) @@ -51,7 +54,7 @@ with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) > 0 and len(fields) % 2 == 1 word_id = int(fields[0]) From 4767c7ce0aef8db9d2e4bdd708773fc84ef1cf0b Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 8 Mar 2019 22:03:50 +0530 Subject: [PATCH 22/49] Update pocolm_cust.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh index 422db15937a..0e71be29119 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh @@ -13,7 +13,7 @@ export PATH=$PATH:$POCOLM_ROOT/scripts wordlist=None num_word=100000 -pocolm_stage=2 +pocolm_stage=1 ngram_order=3 lm_dir= arpa_dir= From 2cd5948302c2f4c787a28d7fc96b700af8f525c3 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 8 Mar 2019 22:04:58 +0530 Subject: [PATCH 23/49] Update run.sh --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 7e488cdc5fa..b63b5208138 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,6 +6,7 @@ stage=-1 lmstage=-2 +num_words_pocolm=110000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is @@ -96,7 +97,6 @@ if [ $stage -le 0 ]; then cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi -num_words_pocolm=110000 if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ From 6595b429f3e743f779f8ef7f3e9f605bb6bd8105 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 18 Mar 2019 14:56:40 +0000 Subject: [PATCH 24/49] Added steps for generating POCOLM ARPA file --- .../s5_gigaword/local/train_pocolm.sh | 7 +++++-- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index 964dd3bbcc5..b8b3ca35ef9 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -43,8 +43,11 @@ if [ $stage -le -1 ];then python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - - + prune_lm_dir.py --target-num-ngrams=$prune_size 
"$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ + "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" + mkdir -p "$pocolm_dir"/arpa + format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ + gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index b63b5208138..1ad8f9f1e0b 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,6 +6,7 @@ stage=-1 lmstage=-2 +addtraintext=true num_words_pocolm=110000 train_sgmm2=false @@ -95,6 +96,9 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + if $addtraintext; then + cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + fi fi if [ $stage -le 1 ]; then From 0902c9e02c139cbf41d6d5c944957ee46a1bca6d Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sun, 24 Mar 2019 10:52:19 +0530 Subject: [PATCH 25/49] Update run.sh --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 1ad8f9f1e0b..970a058a07f 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -32,9 +32,9 @@ if [ -f path.sh ]; then . ./path.sh; fi set -eou pipefail if [ $stage -le -1 ]; then -# local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts -# local/callhome_data_prep.sh $callhome_speech $callhome_transcripts + local/callhome_data_prep.sh $callhome_speech $callhome_transcripts # The lexicon is created using the LDC spanish lexicon, the words from the # fisher spanish corpus. Additional (most frequent) words are added from the From c10b0fe6d3a8be4e75fb31477acca179265c2ca4 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sun, 24 Mar 2019 06:44:49 +0000 Subject: [PATCH 26/49] Apply g2p part added to get extended lexicon --- .../s5_gigaword/local/get_rnnlm_wordlist.py | 16 ++--- .../s5_gigaword/run.sh | 60 +++++++++++-------- egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh | 7 +-- 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py index d6ddfbecc14..fc13a7af701 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py @@ -1,17 +1,18 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # 2018 Saikiran Valluri, GoVivace inc. 
import os, sys -if len(sys.argv) < 4: - print( "Usage: python get_rnnlm_wordlist.py ") +if len(sys.argv) < 5: + print( "Usage: python get_rnnlm_wordlist.py ") sys.exit() -lexicon_words = open(sys.argv[1], 'r') -pocolm_words = open(sys.argv[2], 'r') -rnnlm_wordsout = open(sys.argv[3], 'w') +lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") +pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") +rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") +oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") line_count=0 lexicon=[] @@ -23,10 +24,11 @@ for line in pocolm_words: if not line.split()[0] in lexicon: + oov_wordlist.write(line.split()[0]+'\n') rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') line_count = line_count + 1 lexicon_words.close() pocolm_words.close() rnnlm_wordsout.close() - +oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 1ad8f9f1e0b..4abd34096ef 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,6 +6,7 @@ stage=-1 lmstage=-2 +train_rnnlm=true addtraintext=true num_words_pocolm=110000 train_sgmm2=false @@ -32,31 +33,23 @@ if [ -f path.sh ]; then . ./path.sh; fi set -eou pipefail if [ $stage -le -1 ]; then -# local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts -# local/callhome_data_prep.sh $callhome_speech $callhome_transcripts + local/callhome_data_prep.sh $callhome_speech $callhome_transcripts # The lexicon is created using the LDC spanish lexicon, the words from the # fisher spanish corpus. Additional (most frequent) words are added from the # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + # Let's keep the original dict copy for G2P training + cp -r data/local/dict data/local/dict_orig ( - steps/dict/train_g2p_seq2seq.sh data/local/dict/lexicon.txt exp/g2p || touch exp/g2p/.error + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error ) & # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh + utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig utils/fix_data_dir.sh data/local/data/train_all @@ -79,11 +72,7 @@ if [ $stage -le -1 ]; then local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome - wait # wait till G2P training finishes - if [ -f exp/g2p/.error ]; then - rm exp/g2p/.error || true - echo "Fail to train the G2P model." 
&& exit 1; - fi + fi if [ $stage -le 0 ]; then @@ -103,16 +92,37 @@ fi if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ - "$rnnlm_workdir"/rnnlm_wordlist -fi - -if [ $stage -le 2 ]; then - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + local/get_rnnlm_wordlist.py data/local/dict/lexicon.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords + if $train_rnnlm; then + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm + fi fi + if [ $stage -le 2 ]; then + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex + cat "$rnnlm_workdir"/oov_g2p.lex data/local/dict/lexicon.txt | sort -u > "$rnnlm_workdir"/lexicon_extended.txt + cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. Depending on your needs, you might have to change the size of + # the splits within that file. 
The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir diff --git a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh index 77a08c305dd..e6e316ec6b1 100644 --- a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh +++ b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh @@ -17,10 +17,9 @@ set -u set -e if [ $# != 3 ]; then - echo "Usage: $0 [options] " - echo " where is the training lexicon (one pronunciation per " - echo " word per line, with lines like 'hello h uh l ow') and" - echo " is directory where the models will be stored" + echo "Usage: $0 [options] " + echo " where is the OOV wordlist " + echo " is directory where the models will be stored" exit 1; fi From 3df45aec1d8f8a031eb8665c5c94e6be27e81803 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sun, 24 Mar 2019 07:49:08 +0000 Subject: [PATCH 27/49] Small fix in run.sh rnnlm_wordlist --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 4abd34096ef..9d332cf06de 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -92,7 +92,7 @@ fi if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - local/get_rnnlm_wordlist.py data/local/dict/lexicon.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords if $train_rnnlm; then local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ From 7e47695e793c113c385398dafb32f92572aec6f7 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 25 Mar 2019 06:28:24 +0000 Subject: [PATCH 28/49] Added sanity chack for Sparrowhawk normalizer in cleanup script --- .../s5_gigaword/local/clean_txt_dir.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh index 60269c0ab7e..1880b3a90cb 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh @@ -17,6 +17,12 @@ if [ $# -ne 2 ]; then exit 1; fi +if [ ! -s `which normalizer_main` ] ; then + echo "Sparrowhawk normalizer was not found installed !" + echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" 
+ exit 1 +fi + txtdir=$1 textdir=$(realpath $txtdir) outdir=$(realpath $2) @@ -38,7 +44,7 @@ if [ $stage -le 0 ]; then $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ local/run_norm.sh \ sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ + $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ $outdir/data \ JOB \ $outdir/sparrowhawk/ From 91a4611bba540c907b223c39b658bc5baca3a80f Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 25 Mar 2019 07:10:49 +0000 Subject: [PATCH 29/49] Data preparation fixes --- .../s5_gigaword/local/chain/run_tdnn_1g.sh | 7 ++++++- .../s5_gigaword/local/fsp_data_prep.sh | 1 + egs/fisher_callhome_spanish/s5_gigaword/run.sh | 8 +++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh index c487f1bd222..08e378cf8c5 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh @@ -27,9 +27,10 @@ nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. common_egs_dir= reporting_email= +gigaword_workdir= # LSTM/chain options -train_stage=-10 +train_stage=-20 xent_regularize=0.1 dropout_schedule='0,0@0.20,0.3@0.50,0' @@ -277,6 +278,10 @@ if [ $stage -le 23 ]; then --online-ivector-dir exp/nnet3/ivectors_${data}_hires \ $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done + if [ $gigaword_workdir ]; then + bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; + fi bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh index 11d65da3e95..22b98a6c9db 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh @@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then sed 's:::g' | \ sed 's:foreign>::g' | \ + sed 's:\[noise\]:[noise] :g' | \ sed 's:>::g' | \ #How do you handle numbers? grep -v '()' | \ diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 9d332cf06de..687fcfdf3c1 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -23,7 +23,7 @@ callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=/export/c03/svalluri/workdir_pocolm_2stage +rnnlm_workdir=workdir_rnnlm_Spanish_08032019 mfccdir=`pwd`/mfcc . 
./cmd.sh @@ -75,6 +75,7 @@ if [ $stage -le -1 ]; then fi + if [ $stage -le 0 ]; then mkdir -p "$rnnlm_workdir"/gigaword_rawtext local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 @@ -90,6 +91,7 @@ if [ $stage -le 0 ]; then fi fi + if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ @@ -108,7 +110,7 @@ if [ $stage -le 2 ]; then echo "Fail to train the G2P model." && exit 1; fi steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex data/local/dict/lexicon.txt | sort -u > "$rnnlm_workdir"/lexicon_extended.txt + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sort | uniq | sed "/^$/d" > "$rnnlm_workdir"/lexicon_extended.txt cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -294,6 +296,6 @@ fi wait; if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh || exit 1; + local/chain/run_tdnn_1g.sh --gigaword-workdir $rnnlm_workdir || exit 1; fi exit 0; From 5f45dd17453dc3eb2424b35d78e1ed3eb20a5a2c Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 26 Mar 2019 08:02:39 -0400 Subject: [PATCH 30/49] Cosmetic options for gigaword textclean --- .../s5_gigaword/path.sh | 6 +++-- .../s5_gigaword/run.sh | 23 +++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index e622e7d5051..2993311fd90 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -7,5 +7,7 @@ export LD_LIBRARY_PATH=/home/dpovey/libs export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C.UTF-8 -export LANG=C.UTF-8 +export LC_ALL=C +export LANG=C + +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 687fcfdf3c1..e1c43d24902 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,7 +6,8 @@ stage=-1 lmstage=-2 -train_rnnlm=true +train_rnnlm=false +start_textcleanup=false addtraintext=true num_words_pocolm=110000 train_sgmm2=false @@ -14,7 +15,7 @@ train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). 
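# Annotation (usage sketch, not part of the committed hunk): because run.sh later
# sources parse_options.sh, each variable in this block can be overridden from the
# command line; the paths below are placeholders, not recipe defaults:
#   ./run.sh --stage -1 --train-rnnlm true \
#     --sfisher-speech /my/corpora/LDC2010S01 \
#     --sfisher-transcripts /my/corpora/LDC2010T04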
sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +sfisher_transcripts=/export/c03/svalluri//LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher @@ -44,9 +45,9 @@ if [ $stage -le -1 ]; then local/fsp_prepare_dict.sh $spanish_lexicon # Let's keep the original dict copy for G2P training cp -r data/local/dict data/local/dict_orig - ( - steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error - ) & +# ( +# steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error +# ) & # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig @@ -75,8 +76,12 @@ if [ $stage -le -1 ]; then fi +if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can opt out this stage " + echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." -if [ $stage -le 0 ]; then + if [ $stage -le 0 ]; then mkdir -p "$rnnlm_workdir"/gigaword_rawtext local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt @@ -89,9 +94,9 @@ if [ $stage -le 0 ]; then if $addtraintext; then cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi + fi fi - if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ @@ -110,7 +115,7 @@ if [ $stage -le 2 ]; then echo "Fail to train the G2P model." && exit 1; fi steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sort | uniq | sed "/^$/d" > "$rnnlm_workdir"/lexicon_extended.txt + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -296,6 +301,6 @@ fi wait; if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh --gigaword-workdir $rnnlm_workdir || exit 1; + local/chain/run_tdnn_1g.sh --stage 9 --gigaword-workdir $rnnlm_workdir || exit 1; fi exit 0; From e711d30f7bb77c3c5fa1e766de1896d1559bd3a1 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 1 Apr 2019 07:16:17 -0400 Subject: [PATCH 31/49] Some fixes in rnnlm training --- .../s5_gigaword/local/chain/run_tdnn_1g.sh | 9 +++++---- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 16 ++++++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh index 08e378cf8c5..2f478419a18 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh @@ -202,7 +202,7 @@ fi if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi @@ -255,9 +255,10 @@ if [ $stage -le 21 ]; then fi +# Let's train first a small RNNLM on Fisher train set rnnlmdir=exp/rnnlm_lstm_tdnn_1b if [ $stage -le 22 ]; then - local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; + rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; fi if [ $stage -le 23 ]; then @@ -279,10 +280,10 @@ if [ $stage -le 23 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done if [ $gigaword_workdir ]; then - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; fi - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index e1c43d24902..95425c29034 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -7,8 +7,12 @@ stage=-1 lmstage=-2 train_rnnlm=false -start_textcleanup=false -addtraintext=true +start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. + # If you already have the normalised gigword text somewhere, you can bypass the + # time consuming text cleanup (~1 week) by setting this option false. +addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to + # perform the A, A + G, Dev type POCOLM training configuration. 
+ # A=fsp train, G=gigword text, num_words_pocolm=110000 train_sgmm2=false @@ -45,9 +49,9 @@ if [ $stage -le -1 ]; then local/fsp_prepare_dict.sh $spanish_lexicon # Let's keep the original dict copy for G2P training cp -r data/local/dict data/local/dict_orig -# ( -# steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error -# ) & + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig @@ -301,6 +305,6 @@ fi wait; if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh --stage 9 --gigaword-workdir $rnnlm_workdir || exit 1; + local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; fi exit 0; From 8d521c694f0809cfb058568123fc8355406d1b78 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 1 Apr 2019 07:18:06 -0400 Subject: [PATCH 32/49] Moved s5_gigaword directory to s5 --- egs/fisher_callhome_spanish/s5/RESULTS | 38 ------ egs/fisher_callhome_spanish/s5/cmd.sh | 4 +- .../s5/local/chain/run_tdnn_1g.sh | 16 ++- .../s5/local/clean_abbrevs_text.py | 35 +++++ .../s5/local/clean_txt_dir.sh | 57 +++++++++ egs/fisher_callhome_spanish/s5/local/ctm.sh | 6 +- .../flatten_gigaword/flatten_all_gigaword.sh | 15 +++ .../flatten_gigaword/flatten_one_gigaword.py | 61 +++++++++ .../s5/local/flatten_gigaword/run_flat.sh | 17 +++ .../s5/local/fsp_data_prep.sh | 1 + .../s5/local/fsp_prepare_dict.sh | 5 +- .../s5/local/get_data_weights.pl | 39 ++++++ .../s5/local/get_rnnlm_wordlist.py | 34 +++++ .../s5/local/get_unigram_weights_vocab.py | 33 +++++ .../s5/local/merge_lexicons.py | 7 +- .../s5/local/pocolm_cust.sh | 120 +++++++++++++++++ egs/fisher_callhome_spanish/s5/local/rnnlm.sh | 83 ++++++++++++ .../s5/local/rnnlm/train_rnnlm.sh | 101 --------------- .../s5/local/run_norm.sh | 36 ++++++ .../s5/local/train_pocolm.sh | 54 ++++++++ egs/fisher_callhome_spanish/s5/path.sh | 11 +- egs/fisher_callhome_spanish/s5/run.sh | 121 ++++++++++++------ egs/fisher_callhome_spanish/s5/steps | 2 +- egs/fisher_callhome_spanish/s5/utils | 2 +- 24 files changed, 699 insertions(+), 199 deletions(-) delete mode 100644 egs/fisher_callhome_spanish/s5/RESULTS create mode 100644 egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py create mode 100755 egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh create mode 100644 egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py create mode 100755 egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/get_data_weights.pl create mode 100755 egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py create mode 100644 egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py create mode 100755 egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/rnnlm.sh delete mode 100755 egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/run_norm.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/train_pocolm.sh diff --git a/egs/fisher_callhome_spanish/s5/RESULTS b/egs/fisher_callhome_spanish/s5/RESULTS deleted file mode 100644 index 66613163cea..00000000000 --- a/egs/fisher_callhome_spanish/s5/RESULTS +++ /dev/null @@ -1,38 +0,0 @@ 
--------------------------------------------------------------------------------------- -Triphone with mono alignment (small) --------------------------------------------------------------------------------------- -%WER 53.70 [ 21570 / 40170, 2618 ins, 6013 del, 12939 sub ] exp/tri1/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -Triphone with tri alignments --------------------------------------------------------------------------------------- -%WER 53.18 [ 21364 / 40170, 2889 ins, 5533 del, 12942 sub ] exp/tri2/decode_dev/wer_13_0.0 - --------------------------------------------------------------------------------------- -Triphone + LDA + MLLT --------------------------------------------------------------------------------------- -%WER 46.95 [ 18858 / 40170, 2636 ins, 5197 del, 11025 sub ] exp/tri3a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ SAT + fMLLR --------------------------------------------------------------------------------------- -%WER 42.86 [ 17217 / 40170, 2556 ins, 4633 del, 10028 sub ] exp/tri4a/decode_dev/wer_15_0.0 - --------------------------------------------------------------------------------------- -+ More leaves and gaussians --------------------------------------------------------------------------------------- -%WER 40.48 [ 16261 / 40170, 2689 ins, 4130 del, 9442 sub ] exp/tri5a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ bMMI + SGMM --------------------------------------------------------------------------------------- -%WER 38.43 [ 15437 / 40170, 2800 ins, 3685 del, 8952 sub ] exp/sgmm5/decode_dev/wer_10_0.0 -%WER 36.90 [ 14821 / 40170, 2708 ins, 3552 del, 8561 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it1/wer_10_0.0 -%WER 36.09 [ 14499 / 40170, 2511 ins, 3737 del, 8251 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it2/wer_11_0.0 -%WER 35.48 [ 14252 / 40170, 2672 ins, 3370 del, 8210 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it3/wer_10_0.0 -%WER 35.16 [ 14122 / 40170, 2701 ins, 3287 del, 8134 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it4/wer_10_0.0 - --------------------------------------------------------------------------------------- -pNorm-Ensemble DNN --------------------------------------------------------------------------------------- -%WER 35.13 [ 14113 / 40170, 2680 ins, 3405 del, 8028 sub ] exp/tri6a_dnn/decode_dev/wer_11_0.0 diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index 88db78823a5..db97f1fbc6f 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 4G" -export decode_cmd="queue.pl --mem 4G" +export train_cmd="retry.pl queue.pl --mem 8G" +export decode_cmd="retry.pl queue.pl --mem 8G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 7f407552c2e..2f478419a18 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -27,9 +27,10 @@ nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=1g #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. common_egs_dir= reporting_email= +gigaword_workdir= # LSTM/chain options -train_stage=-10 +train_stage=-20 xent_regularize=0.1 dropout_schedule='0,0@0.20,0.3@0.50,0' @@ -156,7 +157,7 @@ if [ $stage -le 19 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" @@ -201,7 +202,7 @@ fi if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi @@ -254,9 +255,10 @@ if [ $stage -le 21 ]; then fi +# Let's train first a small RNNLM on Fisher train set rnnlmdir=exp/rnnlm_lstm_tdnn_1b if [ $stage -le 22 ]; then - local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; + rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; fi if [ $stage -le 23 ]; then @@ -277,7 +279,11 @@ if [ $stage -le 23 ]; then --online-ivector-dir exp/nnet3/ivectors_${data}_hires \ $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ + if [ $gigaword_workdir ]; then + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; + fi + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & done diff --git a/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py new file mode 100644 index 00000000000..7d92eb9fe3a --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., + +import os, sys +import re +import codecs + +if len(sys.argv) < 3: + print("Usage : python clean_abbrevs_text.py ") + print(" Processes the text before text normalisation to convert uppercase words as space separated letters") + sys.exit() + +inputfile=codecs.open(sys.argv[1], encoding='utf-8') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') + +for line in inputfile: + words = line.split() + textout = "" + wordcnt = 0 + for word in words: + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): + if wordcnt > 0: + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " + else: + textout = textout + word + " " + if word.isalpha(): wordcnt = wordcnt + 1 + outputfile.write(textout.strip()+ '\n') + +inputfile.close() +outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh new file mode 100755 index 00000000000..1880b3a90cb --- /dev/null +++ 
b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Script to clean up gigaword LM text +# Removes punctuations, does case normalization + +stage=0 +nj=500 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1; +fi + +if [ ! -s `which normalizer_main` ] ; then + echo "Sparrowhawk normalizer was not found installed !" + echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" + exit 1 +fi + +txtdir=$1 +textdir=$(realpath $txtdir) +outdir=$(realpath $2) + +workdir=$outdir/tmp +if [ $stage -le 0 ]; then + rm -rf $outdir + mkdir -p $workdir + mkdir -p $textdir/splits + mkdir -p $outdir/data + split -l 1000000 $textdir/in.txt $textdir/splits/out + numsplits=0 + for x in $textdir/splits/*; do + numsplits=$((numsplits+1)) + ln -s $x $outdir/data/$numsplits + done + echo $numsplits + cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . + $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ + local/run_norm.sh \ + sparrowhawk_configuration.ascii_proto \ + $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $outdir/data \ + JOB \ + $outdir/sparrowhawk/ + cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized + + # check if numbers are there in normalized output + awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ + $outdir/text_normalized > $outdir/unique_words + grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers +fi diff --git a/egs/fisher_callhome_spanish/s5/local/ctm.sh b/egs/fisher_callhome_spanish/s5/local/ctm.sh index 62860a10b7b..7d09f574580 100755 --- a/egs/fisher_callhome_spanish/s5/local/ctm.sh +++ b/egs/fisher_callhome_spanish/s5/local/ctm.sh @@ -19,9 +19,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} +#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . /export/babel/data/software/env.sh diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh new file mode 100755 index 00000000000..242359e7c28 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +# Path to Gigaword corpus with all data files decompressed. +export GIGAWORDDIR=$1 +# The directory to write output to +export OUTPUTDIR=$2 +# The number of jobs to run at once +export NUMJOBS=$3 + +echo "Flattening Gigaword with ${NUMJOBS} processes..." +mkdir -p $OUTPUTDIR +find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; +echo "Combining the flattened files into one..." 
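# Annotation (usage sketch, not part of the committed file): run.sh drives this
# wrapper with the Gigaword source directory, an output directory and a job count,
# e.g.
#   local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" \
#     "$rnnlm_workdir"/flattened_gigaword_corpus 24
# The per-file flattening itself is done by local/flatten_gigaword/run_flat.sh via
# the find/-exec call above.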
+cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py new file mode 100644 index 00000000000..29f6766dd84 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import logging +import os +import re +import spacy +import gzip + +from argparse import ArgumentParser +from bs4 import BeautifulSoup + +en_nlp = spacy.load("es") + + +def flatten_one_gigaword_file(file_path): + f = gzip.open(file_path) + html = f.read() + # Parse the text with BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + + # Iterate over all
<p>
items and get the text for each. + all_paragraphs = [] + for paragraph in soup("p"): + # Turn inter-paragraph newlines into spaces + paragraph = paragraph.get_text() + paragraph = re.sub(r"\n+", "\n", paragraph) + paragraph = paragraph.replace("\n", " ") + # Tokenize the paragraph into words + tokens = en_nlp.tokenizer(paragraph) + words = [str(token) for token in tokens if not + str(token).isspace()] + if len(words) < 3: + continue + all_paragraphs.append(words) + # Return a list of strings, where each string is a + # space-tokenized paragraph. + return [" ".join(paragraph) for paragraph in all_paragraphs] + + +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + parser = ArgumentParser(description=("Flatten a gigaword data file for " + "use in language modeling.")) + parser.add_argument("--gigaword-path", required=True, + metavar="", type=str, + help=("Path to Gigaword directory, with " + "all .gz files unzipped.")) + parser.add_argument("--output-dir", required=True, metavar="", + type=str, help=("Directory to write final flattened " + "Gigaword file.")) + + A = parser.parse_args() + all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) + output_path = os.path.join(A.output_dir, + os.path.basename(A.gigaword_path) + ".flat") + with open(output_path, "w") as output_file: + for paragraph in all_paragraphs: + output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh new file mode 100755 index 00000000000..6b236be0ab9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -e + +. ./path_venv.sh + +# Path to Gigaword corpus with all data files decompressed. +GIGAWORDPATH=$1 +# The directory to write output to +OUTPUTDIR=$2 +file=$(basename ${GIGAWORDPATH}) +if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then + echo "flattening to ${OUTPUTDIR}/${file}.flat" + python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} +else + echo "skipping ${file}.flat" +fi + diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index 11d65da3e95..22b98a6c9db 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then sed 's:::g' | \ sed 's:foreign>::g' | \ + sed 's:\[noise\]:[noise] :g' | \ sed 's:>::g' | \ #How do you handle numbers? 
grep -v '()' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 779298305c4..7b2de2db392 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -105,8 +105,9 @@ if [ $stage -le 4 ]; then cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" # Add prons for laughter, noise, oov - w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') - perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2 + for w in `grep -v sil $dir/silence_phones.txt`; do + sed -i "/\[$w\]/d" $tmpdir/lexicon.2 + done for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl new file mode 100755 index 00000000000..ca5b2a46f8e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl @@ -0,0 +1,39 @@ +#!/usr/bin/env perl + +# Nagendra Kumar Goel + +# This takes two arguments: +# 1) Pocolm training output folder +# 2) rnnlm weights file name (for output) + +use POSIX; +use List::Util qw[min max]; + +if (@ARGV != 2) { + die "Usage: get_data_weights.pl \n"; +} + +$pdir = shift @ARGV; +$out = shift @ARGV; + +open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; +open(N, "<$pdir/names") || die "Could not open $pdir/names" ; +open(O, ">$out") || die "Could not open $out for writing" ; + +my %scores = (); + +while() { + @n = split(/\s/,$_); + $name = $n[1]; + $w =
<P>
; + @w = split(/\s/,$w); + $weight = $w[1]; + $scores{$name} = $weight; +} + +$min = min(values %scores); + +for(keys %scores) { + $weightout = POSIX::ceil($scores{$_} / $min); + print O "$_\t1\t$weightout\n"; +} diff --git a/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py new file mode 100755 index 00000000000..fc13a7af701 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 5: + print( "Usage: python get_rnnlm_wordlist.py ") + sys.exit() + +lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") +pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") +rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") +oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") + +line_count=0 +lexicon=[] + +for line in lexicon_words: + lexicon.append(line.split()[0]) + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +for line in pocolm_words: + if not line.split()[0] in lexicon: + oov_wordlist.write(line.split()[0]+'\n') + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +lexicon_words.close() +pocolm_words.close() +rnnlm_wordsout.close() +oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py new file mode 100644 index 00000000000..3ecd16772d7 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 3: + print("Usage : python . ") + print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") + sys.exit() + +pocolmdir=sys.argv[1] +unigramwts=open(sys.argv[2], 'w') + +names = open(pocolmdir+"/names", 'r') +metaparams = open(pocolmdir+"/metaparameters", 'r') + +name_mapper={} +for line in names: + fields=line.split() + name_mapper[fields[0]] = fields[1] + +lns = metaparams.readlines() +for lineno in range(len(name_mapper.keys())): + line = lns[lineno] + fileid = line.split()[0].split("_")[-1] + weight = line.split()[1] + unigramwts.write(name_mapper[fileid] + " " + weight + "\n") + +names.close() +unigramwts.close() +metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index b42eb52d20a..94546dc44c3 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,11 +1,12 @@ -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 -# 2018 Saikiran Valluri, GoVivace inc., Avaaya #!/usr/bin/env python # -*- coding: utf-8 -*- # +# 2018 Saikiran Valluri, GoVivace inc., Avaaya + # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon from __future__ import print_function -import sys, re +import sys +import re import json import codecs import operator diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh new file mode 100755 index 00000000000..0e71be29119 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash + +# this script generates Pocolm-estimated language models with various +# data sources in data/text folder and places the output in data/lm. + +set -euo pipefail + +. ./path.sh + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + + +wordlist=None +num_word=100000 +pocolm_stage=1 +ngram_order=3 +lm_dir= +arpa_dir= +textdir= +max_memory='--max-memory=8G' + +. ./cmd.sh +. ./utils/parse_options.sh + + +# If you do not want to set memory limitation for "sort", you can use +#max_memory= +# Choices for the max-memory can be: +# 1) integer + 'K', 'M', 'G', ... +# 2) integer + 'b', meaning unit is byte and no multiplication +# 3) integer + '%', meaning a percentage of memory +# 4) integer, default unit is 'K' + +fold_dev_opt= +# If you want to fold the dev-set in to the 'swbd1' set to produce the final +# model, un-comment the following line. For use in the Kaldi example script for +# ASR, this isn't suitable because the 'dev' set is the first 10k lines of the +# switchboard data, which we also use as dev data for speech recognition +# purposes. +#fold_dev_opt="--fold-dev-into=swbd1" + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 3-gram model running with train_lm.py. +# the dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018,0.902,0.371,0.183,0.070" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +limit_unk_history_opt= +# If you want to limit the left of in the history of a n-gram +# un-comment the following line +#limit_unk_history_opt="--limit-unk-history=true" + +for order in ${ngram_order}; do + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
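# Annotation (expansion sketch, not part of the committed file): with the arguments
# local/train_pocolm.sh passes in this recipe, lm_name below expands to "0_2" for the
# first, closed-vocabulary pass and "110000_3" for the second pass, so the unpruned
# models land in ${lm_dir}/0_2.pocolm and ${lm_dir}/110000_3.pocolm (the latter is the
# directory run.sh later hands to local/get_rnnlm_wordlist.py). Example invocation:
#   bash local/pocolm_cust.sh --num-word 110000 --ngram-order 3 \
#     --lm-dir "$pocolm_dir"/lm --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir"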
+ lm_name="${num_word}_${order}" + min_counts='' + # Note: the following might be a more reasonable setting: + # min_counts='fisher=2 swbd1=1' + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ + --min-counts=${min_counts} \ + --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ + ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + + if [ $pocolm_stage -eq 2 ];then + mkdir -p ${arpa_dir} + format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz + + # example of pruning. note: the threshold can be less than or more than one. + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + for threshold in 1.0 2.0 4.0; do + pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm + prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz + + done + + # example of pruning by size. + size=1000000 + pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm + prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' + get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz + fi +done + +# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) + +# the following does does some self-testing, including +# that the computed derivatives are accurate. +# local/self_test.sh + +# perplexities from pocolm-estimated language models with pocolm's interpolation +# method from orders 3, 4, and 5 are: +# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) +# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) +# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) + +# note, the perplexities from pocolm-estimated language models with SRILM's +# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), +# 78.8449 and 75.2202 respectively. + +# note, the perplexities from SRILM-estimated language models with SRILM's +# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), +# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh new file mode 100755 index 00000000000..3850910f312 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Xiaohui Zhang + +# This script trains LMs on the swbd LM-training data. + +# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. 
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. +# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 +# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 + + +dir=Spanish_gigawrd/rnnlm +pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned +wordslist= +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=0 +train_stage=-30 +text_dir=Spanish_gigawrd/text_lm + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $dir/config +set -e + +for f in $text_dir/dev.txt; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist;" && exit 1 +done + +if [ $stage -le 0 ]; then + if [ -f $text_dir/unigram_weights ] ; then + mv $text_dir/unigram_weights $pocolm_dir/ + fi + cp $wordslist $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt + rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ + --unk-word="" \ + --data-weights-file=$dir/config/data_weights.txt \ + $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt +fi + +if [ $stage -le 1 ]; then + cat <$dir/config/xconfig + input dim=$embedding_dim name=input + relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) + fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) + fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) + output-layer name=output include-log-softmax=false dim=$embedding_dim +EOF + rnnlm/validate_config_dir.sh $text_dir $dir/config +fi + +if [ $stage -le 2 ]; then + rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir +fi + +if [ $stage -le 3 ]; then + rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ + --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir +fi + +exit 0 diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh deleted file mode 100755 index 3713fe228d6..00000000000 --- a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson -# 2017 Hainan Xu -# 2017 Ke Li - -# This script is similar to rnnlm_lstm_tdnn_a.sh except for adding L2 regularization. - -# local/rnnlm/train_rnnlm.sh: best iteration (out of 18) was 17, linking it to final iteration. -# local/rnnlm/train_rnnlm.sh: train/dev perplexity was 45.6 / 68.7. 
-# Train objf: -651.50 -4.44 -4.26 -4.15 -4.08 -4.03 -4.00 -3.97 -3.94 -3.92 -3.90 -3.89 -3.88 -3.86 -3.85 -3.84 -3.83 -3.82 -# Dev objf: -10.76 -4.68 -4.47 -4.38 -4.33 -4.29 -4.28 -4.27 -4.26 -4.26 -4.25 -4.24 -4.24 -4.24 -4.23 -4.23 -4.23 -4.23 - -# Begin configuration section. -dir=exp/rnnlm_lstm_tdnn_1b -embedding_dim=200 -embedding_l2=0.005 # embedding layer l2 regularize -comp_l2=0.005 # component-level l2 regularize -output_l2=0.005 # output-layer l2 regularize -epochs=90 -mic= -stage=-10 -train_stage=0 - -. ./cmd.sh -. ./utils/parse_options.sh -[ -z "$cmd" ] && cmd=$train_cmd - -train=data/train/text -dev=data/dev2/text # We at no stage in run.sh should decode dev2 partition for results! -wordlist=data/lang/words.txt -text_dir=data/local/rnnlm/text -mkdir -p $dir/config -set -e - -for f in $train $dev $wordlist; do - [ ! -f $f ] && \ - echo "$0: expected file $f to exist; search for run.sh and utils/prepare_lang.sh in run.sh" && exit 1 -done - -if [ $stage -le 0 ]; then - mkdir -p $text_dir - cat $train | cut -d ' ' -f2- > $text_dir/ami.txt - cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt -fi - -if [ $stage -le 1 ]; then - cp $wordlist $dir/config/ - n=`cat $dir/config/words.txt | wc -l` - echo " $n" >> $dir/config/words.txt - - # words that are not present in words.txt but are in the training or dev data, will be - # mapped to during training. - echo "" >$dir/config/oov.txt - - cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt - - # choose features - rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ - --use-constant-feature=true \ - --top-word-features 10000 \ - --min-frequency 1.0e-03 \ - --special-words=',,,,[noise],[laughter]' \ - $dir/config/words.txt > $dir/config/features.txt - -lstm_opts="l2-regularize=$comp_l2" -tdnn_opts="l2-regularize=$comp_l2" -output_opts="l2-regularize=$output_l2" - - cat >$dir/config/xconfig < $dir/normalize/$job/substitute.sh + +bash $dir/normalize/$job/substitute.sh | \ + sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ + sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text +normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh new file mode 100755 index 00000000000..b8b3ca35ef9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +stage=-2 +num_words_pocolm=110000 +prune_size=1000000 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + +textdir=$1 +pocolm_dir=$2 + + +if [ $stage -le -2 ]; then + echo "****" + echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." + echo "****" + if [ -e "$textdir"/unigram_weights ]; then + rm "$textdir"/unigram_weights + fi + if [ -e "$pocolm_dir" ]; then + rm -r "$pocolm_dir" + fi + + bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + +fi + +if [ $stage -le -1 ];then + echo "********" + echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." 
+ echo "********" + + echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done + python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ + "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" + mkdir -p "$pocolm_dir"/arpa + format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ + gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz +fi + + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 17ffb0369f8..2993311fd90 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -1,6 +1,13 @@ -export KALDI_ROOT=`pwd`/../../.. +export KALDI_ROOT=`pwd`/../../../ +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh +export LD_LIBRARY_PATH=/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs +export LANG=C + +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 6e2752a7b68..95425c29034 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -4,14 +4,22 @@ # Copyright 2014 Gaurav Kumar. Apache 2.0 # Recipe for Fisher/Callhome-Spanish -stage=0 -train_stage=-20 +stage=-1 +lmstage=-2 +train_rnnlm=false +start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. + # If you already have the normalised gigword text somewhere, you can bypass the + # time consuming text cleanup (~1 week) by setting this option false. +addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to + # perform the A, A + G, Dev type POCOLM training configuration. + # A=fsp train, G=gigword text, +num_words_pocolm=110000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +sfisher_transcripts=/export/c03/svalluri//LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher @@ -19,15 +27,17 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data +rnnlm_workdir=workdir_rnnlm_Spanish_08032019 mfccdir=`pwd`/mfcc . ./cmd.sh if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; -set -e +set -eou pipefail -if [ $stage -le 1 ]; then +if [ $stage -le -1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -37,19 +47,14 @@ if [ $stage -le 1 ]; then # ES gigaword corpus to bring the total to 64k words. 
The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + # Let's keep the original dict copy for G2P training + cp -r data/local/dict data/local/dict_orig + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh + utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig utils/fix_data_dir.sh data/local/data/train_all @@ -70,34 +75,65 @@ if [ $stage -le 1 ]; then cp -r data/local/data/callhome_train_all data/callhome_train_all - # Creating data partitions for the pipeline - # We need datasets for both the ASR and SMT system - # We have 257455 utterances left, so the partitions are roughly as follows - # ASR Train : 100k utterances - # ASR Tune : 17455 utterances - # ASR Eval : 20k utterances - # MT Train : 100k utterances - # MT Tune : Same as the ASR eval set (Use the lattices from here) - # MT Eval : 20k utterances - # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker - # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. - # As noted above, the LM has not been trained on the dev and the test sets. - #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test - #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test - #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test - #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev - #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test - #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train - #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test - #rm -r data/dev_and_test - #rm -r data/asr_dev_and_test - #rm -r data/mt_train_and_test - local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome + fi +if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can opt out this stage " + echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." 
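# Annotation (data-flow sketch, not part of the committed hunk), naming only files
# that the commands below create:
#   "$rnnlm_workdir"/gigaword_rawtext/in.txt                     <- concatenated *.flat Gigaword text
#   "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized  <- Sparrowhawk-normalized output
#   "$rnnlm_workdir"/text_lm/{train.txt,dev.txt,spanish_gigaword_normalised.txt}
#                                                                <- train/dev text consumed by the pocolm and RNNLM stages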
+ + if [ $stage -le 0 ]; then + mkdir -p "$rnnlm_workdir"/gigaword_rawtext + local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 + cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt + local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ + "$rnnlm_workdir"/normalised_gigaword_corpus/ + mkdir -p "$rnnlm_workdir"/text_lm + cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt + cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. + cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + if $addtraintext; then + cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + fi + fi +fi + +if [ $stage -le 1 ]; then + local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords + if $train_rnnlm; then + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm + fi +fi + + if [ $stage -le 2 ]; then + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt + cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. Depending on your needs, you might have to change the size of + # the splits within that file. 
The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -264,8 +300,11 @@ for iter in 1 2 3 4; do data/lang_test data/dev/ exp/sgmm5/decode_dev $decode done ) & - fi -local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; +wait; + +if [ $stage -le 6 ]; then + local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; +fi exit 0; diff --git a/egs/fisher_callhome_spanish/s5/steps b/egs/fisher_callhome_spanish/s5/steps index 6e99bf5b5ad..1b186770dd1 120000 --- a/egs/fisher_callhome_spanish/s5/steps +++ b/egs/fisher_callhome_spanish/s5/steps @@ -1 +1 @@ -../../wsj/s5/steps \ No newline at end of file +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/utils b/egs/fisher_callhome_spanish/s5/utils index b240885218f..a3279dc8679 120000 --- a/egs/fisher_callhome_spanish/s5/utils +++ b/egs/fisher_callhome_spanish/s5/utils @@ -1 +1 @@ -../../wsj/s5/utils \ No newline at end of file +../../wsj/s5/utils/ \ No newline at end of file From f61047074ffc0cf35afbe3535c29d5e19a4c3c9a Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 2 Apr 2019 05:44:59 -0400 Subject: [PATCH 33/49] removed s5_gigaword folder --- .../s5_gigaword/cmd.sh | 15 - .../s5_gigaword/conf/decode.config | 6 - .../s5_gigaword/conf/mfcc.conf | 2 - .../s5_gigaword/conf/mfcc_hires.conf | 10 - .../s5_gigaword/conf/online_cmvn.conf | 1 - .../s5_gigaword/conf/plp.conf | 2 - .../local/callhome_create_splits.sh | 31 - .../s5_gigaword/local/callhome_data_prep.sh | 163 ---- .../s5_gigaword/local/callhome_get_1_best.py | 75 -- .../local/callhome_get_lattices.py | 115 --- .../local/callhome_make_spk2gender.sh | 29 - .../s5_gigaword/local/callhome_make_trans.pl | 74 -- .../s5_gigaword/local/callhome_text_pp.sh | 9 - .../s5_gigaword/local/chain/run_tdnn_1g.sh | 294 ------- .../s5_gigaword/local/clean_abbrevs_text.py | 35 - .../s5_gigaword/local/clean_txt_dir.sh | 57 -- .../s5_gigaword/local/create_oracle_ctm.sh | 30 - .../s5_gigaword/local/create_splits.sh | 30 - .../s5_gigaword/local/ctm.sh | 34 - .../s5_gigaword/local/decode_report.py | 148 ---- .../s5_gigaword/local/find_unique_phones.pl | 25 - .../s5_gigaword/local/fix_stm.sh | 10 - .../flatten_gigaword/flatten_all_gigaword.sh | 15 - .../flatten_gigaword/flatten_one_gigaword.py | 61 -- .../local/flatten_gigaword/run_flat.sh | 17 - .../s5_gigaword/local/fsp_create_test_lang.sh | 49 -- .../s5_gigaword/local/fsp_data_prep.sh | 176 ---- .../local/fsp_ideal_data_partitions.pl | 85 -- .../s5_gigaword/local/fsp_make_spk2gender.sh | 29 - .../s5_gigaword/local/fsp_make_trans.pl | 81 -- .../s5_gigaword/local/fsp_prepare_dict.sh | 142 ---- .../s5_gigaword/local/fsp_train_lms.sh | 140 ---- .../s5_gigaword/local/get_1_best.py | 62 -- .../s5_gigaword/local/get_data_weights.pl | 39 - .../s5_gigaword/local/get_lattices.py | 115 --- .../s5_gigaword/local/get_oracle.sh | 32 - .../s5_gigaword/local/get_rnnlm_wordlist.py | 34 - .../local/get_unigram_weights_vocab.py | 33 - .../s5_gigaword/local/isolate_phones.pl | 66 -- .../s5_gigaword/local/latconvert.sh | 124 --- .../s5_gigaword/local/merge_lexicons.py | 65 -- .../s5_gigaword/local/monitor_denlats.sh | 31 - .../local/nnet3/run_ivector_common.sh | 
187 ----- .../s5_gigaword/local/pocolm_cust.sh | 120 --- .../s5_gigaword/local/process_oracle.py | 64 -- .../s5_gigaword/local/rescore.sh | 24 - .../s5_gigaword/local/rnnlm.sh | 83 -- .../s5_gigaword/local/run_norm.sh | 36 - .../s5_gigaword/local/run_sgmm2x.sh | 57 -- .../s5_gigaword/local/score.sh | 1 - .../s5_gigaword/local/score_oracle.sh | 29 - .../s5_gigaword/local/splits/dev | 20 - .../local/splits/split_callhome/dev | 20 - .../local/splits/split_callhome/test | 20 - .../local/splits/split_callhome/train | 80 -- .../s5_gigaword/local/splits/split_fisher/dev | 20 - .../local/splits/split_fisher/dev2 | 20 - .../local/splits/split_fisher/test | 20 - .../local/splits/split_fisher/train | 759 ------------------ .../s5_gigaword/local/splits/test | 20 - .../s5_gigaword/local/splits/train | 80 -- .../s5_gigaword/local/spron.pl | 304 ------- .../s5_gigaword/local/subset_data_prep.sh | 164 ---- .../s5_gigaword/local/train_get_1_best.py | 79 -- .../s5_gigaword/local/train_get_lattices.py | 125 --- .../s5_gigaword/local/train_pocolm.sh | 54 -- .../s5_gigaword/local/train_process_oracle.py | 79 -- .../s5_gigaword/local/wer_output_filter | 5 - .../s5_gigaword/path.sh | 13 - egs/fisher_callhome_spanish/s5_gigaword/rnnlm | 1 - .../s5_gigaword/run.sh | 310 ------- egs/fisher_callhome_spanish/s5_gigaword/steps | 1 - egs/fisher_callhome_spanish/s5_gigaword/utils | 1 - 73 files changed, 5387 deletions(-) delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/cmd.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh delete mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_prepare_dict.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/local/score.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/test delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/train delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py delete mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path.sh delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/rnnlm delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/run.sh delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/steps delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/utils diff --git a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh deleted file mode 100755 index db97f1fbc6f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh +++ /dev/null @@ -1,15 +0,0 @@ -# you can change cmd.sh depending on what type of queue you are using. -# If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run -# commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different -# queue names and different ways of specifying things like memory; -# to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. Search for -# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, -# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. - -export train_cmd="retry.pl queue.pl --mem 8G" -export decode_cmd="retry.pl queue.pl --mem 8G" -export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config b/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config deleted file mode 100644 index 7908f178373..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config +++ /dev/null @@ -1,6 +0,0 @@ -# Use wider-than-normal decoding beams. -first_beam=16.0 -beam=20.0 -lat_beam=10.0 -min_lmwt=2 -max_lmwt=10 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf deleted file mode 100644 index ffb41a1aae4..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf +++ /dev/null @@ -1,2 +0,0 @@ ---use-energy=false # only non-default option. ---sample-frequency=8000 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf deleted file mode 100644 index d870ab04c38..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf +++ /dev/null @@ -1,10 +0,0 @@ -# config for high-resolution MFCC features, intended for neural network training. -# Note: we keep all cepstra, so it has the same info as filterbank features, -# but MFCC is more easily compressible (because less correlated) which is why -# we prefer this method. ---use-energy=false # use average of log energy, not energy. ---sample-frequency=8000 # Switchboard is sampled at 8kHz ---num-mel-bins=40 # similar to Google's setup. ---num-ceps=40 # there is no dimensionality reduction. 
---low-freq=40 # low cutoff frequency for mel bins ---high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf deleted file mode 100644 index 7748a4a4dd3..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf +++ /dev/null @@ -1 +0,0 @@ -# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf deleted file mode 100644 index c4b73674cab..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf +++ /dev/null @@ -1,2 +0,0 @@ -# No non-default options for now. - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh deleted file mode 100755 index 07814da46a9..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -data_dir=data -train_all=data/callhome_train_all - -if [ $# -lt 1 ]; then - echo "Specify the location of the split files" - exit 1; -fi - -splitFile=$1 - -# Train first -for split in train dev test -do - dirName=callhome_$split - - cp -r $train_all $data_dir/$dirName - - awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ - $splitFile/$split $train_all/segments > $data_dir/$dirName/segments - - n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$dirName/segments | sort | uniq | wc -l` - - echo "$n conversations left in split $dirName" - - utils/fix_data_dir.sh $data_dir/$dirName - utils/validate_data_dir.sh $data_dir/$dirName -done - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh deleted file mode 100755 index f61b0fa9519..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh +++ /dev/null @@ -1,163 +0,0 @@ -#!/bin/bash -# -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Callhome Spanish Dataset. (*.sph files) -# In addition the transcripts are needed as well. -# To be run from one directory above this script. - -# Note: when creating your own data preparation scripts, it's a good idea -# to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the -# transcription file is exactly the same length as the scp file and is also -# sorted on utterance id (missing transcriptions should be removed from the -# scp file using e.g. scripts/filter_scp.pl) - -stage=0 - -export LC_ALL=C - - -if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Callhome Spanish Speech and Transcript Directories, se -e ../run.sh for example." - exit 1; -fi - -cdir=`pwd` -dir=`pwd`/data/local/data -local=`pwd`/local -utils=`pwd`/utils -tmpdir=`pwd`/data/local/tmp - -. ./path.sh || exit 1; # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. 
This relies on the command -# line arguments being absolute pathnames. -#rm -r links/ 2>/dev/null -mkdir -p links/ -ln -s $* links - -# Basic spot checks to see if we got the data that we needed -if [ ! -d links/LDC96S35 -o ! -d links/LDC96T17 ]; -then - echo "The speech and the data directories need to be named LDC96S35 and LDC96T17 respecti -vely" - exit 1; -fi - -if [ ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN ]; -then - echo "Dev, Eval or Train directories missing or not properly organised within the speech data dir" - exit 1; -fi - -#Check the transcripts directories as well to see if they exist -if [ ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest -o ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest -o ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/train ] -then - echo "Transcript directories missing or not properly organised" - exit 1; -fi - -speech_train=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN -speech_dev=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST -speech_test=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST -transcripts_train=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/train -transcripts_dev=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest -transcripts_test=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest - -fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` -fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` -fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` -fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` -fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` -fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` - -#Now check if we got all the files that we needed -if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; -then - echo "Incorrect number of files in the data directories" - echo "The paritions should contain 80/20/20 files" - exit 1; -fi - -if [ $stage -le 0 ]; then - #Gather all the speech files together to create a file list - ( - find $speech_train -iname '*.sph'; - find $speech_dev -iname '*.sph'; - find $speech_test -iname '*.sph'; - ) > $tmpdir/callhome_train_sph.flist - - #Get all the transcripts in one place - - ( - find $transcripts_train -iname '*.txt'; - find $transcripts_dev -iname '*.txt'; - find $transcripts_test -iname '*.txt'; - ) > $tmpdir/callhome_train_transcripts.flist - -fi - -if [ $stage -le 1 ]; then - $local/callhome_make_trans.pl $tmpdir - mkdir -p $dir/callhome_train_all - mv $tmpdir/callhome_reco2file_and_channel $dir/callhome_train_all/ -fi - -if [ $stage -le 2 ]; then - sort $tmpdir/callhome.text.1 | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $dir/callhome_train_all/callhome.text - - #Create segments file and utt2spk file - ! 
cat $dir/callhome_train_all/callhome.text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/callhome_train_all/callhome_utt2spk \ - && echo "Error producing utt2spk file" && exit 1; - - cat $dir/callhome_train_all/callhome.text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; - $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/callhome_train_all/callhome_segments - - $utils/utt2spk_to_spk2utt.pl <$dir/callhome_train_all/callhome_utt2spk > $dir/callhome_train_all/callhome_spk2utt -fi - -if [ $stage -le 3 ]; then - for f in `cat $tmpdir/callhome_train_sph.flist`; do - # convert to absolute path - make_absolute.sh $f - done > $tmpdir/callhome_train_sph_abs.flist - - cat $tmpdir/callhome_train_sph_abs.flist | perl -ane 'm:/([^/]+)\.SPH$: || die "bad line $_; "; print lc($1)," $_"; ' > $tmpdir/callhome_sph.scp - cat $tmpdir/callhome_sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ - sort -k1,1 -u > $dir/callhome_train_all/callhome_wav.scp || exit 1; -fi - -if [ $stage -le 4 ]; then - # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. - cd $cdir - #TODO: needs to be rewritten - $local/callhome_make_spk2gender.sh > $dir/callhome_train_all/callhome_spk2gender -fi - -# Rename files from the callhome directory -if [ $stage -le 5 ]; then - cd $dir/callhome_train_all - mv callhome.text text - mv callhome_segments segments - mv callhome_spk2utt spk2utt - mv callhome_wav.scp wav.scp - mv callhome_reco2file_and_channel reco2file_and_channel - mv callhome_spk2gender spk2gender - mv callhome_utt2spk utt2spk - cd $cdir -fi - -fix_data_dir.sh $dir/callhome_train_all || exit 1 -utils/validate_data_dir.sh --no-feats $dir/callhome_train_all || exit 1 - -echo "CALLHOME spanish Data preparation succeeded." - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py deleted file mode 100755 index a81818c2858..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -import os -import sys - -def findTranscription(timeDetail): - file1 = open('exp/tri5a/decode_callhome_dev/scoring/13.tra') - file2 = open('exp/tri5a/decode_callhome_train/scoring/13.tra') - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - for line in file2: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - - -wordsFile = open('exp/tri5a/graph/words.txt') -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? -# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists('exp/tri5a/one-best/ch_train'): - os.makedirs('exp/tri5a/one-best/ch_train') - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/asr.train', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') - newFile = open('exp/tri5a/one-best/ch_train/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - - newFile.close() -provFile.close() - - - - - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py deleted file mode 100755 index 4c96e01ce7e..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -from __future__ import print_function -import os -import sys -import subprocess - -latticeLocation = 'latjosh-2-callhome/lattices-pushed/' - -tmpdir = 'data/local/data/tmp/ch-d/lattmp' -invalidplfdir = 'data/local/data/tmp/ch-d/invalidplf' -symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' - -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/dev') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/asr.test.plf', 'w+') -invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/invalidPLF', 'w+') -blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/blankPLF', 'w+') -rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/removeLines', 'w+') - -if not os.path.exists(tmpdir): - os.makedirs(tmpdir) -if not os.path.exists(invalidplfdir): - os.makedirs(invalidplfdir) -else: - os.system("rm " + invalidplfdir + "/*") - -def latticeConcatenate(lat1, lat2): - ''' - Concatenates lattices, writes temporary results to tmpdir - ''' - if lat1 == "": - os.system('rm ' + tmpdir + '/tmp.lat') - return lat2 - else: - proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) - proc.wait() - return tmpdir + '/tmp.lat' - - -def findLattice(timeDetail): - ''' - Finds the lattice corresponding to a time segment - ''' - if os.path.isfile(latticeLocation + timeDetail + '.lat'): - return latticeLocation + timeDetail + '.lat' - else: - return -1 - - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
-# Now get timing information to concatenate the ASR outputs - -lineNo = 1 -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') - for line in timingFile: - timeInfo = line.split() - - # For utterances that are concatenated in the translation file, - # the corresponding FSTs have to be translated as well - mergedTranslation = "" - for timeDetail in timeInfo: - tmp = findLattice(timeDetail) - if tmp != -1: - # Concatenate lattices - mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - - print(mergedTranslation) - if mergedTranslation != "": - - # Sanjeev's Recipe : Remove epsilons and topo sort - finalFST = tmpdir + "/final.fst" - os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) - - # Now convert to PLF - proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) - PLFline = proc.stdout.readline() - finalPLFFile = tmpdir + "/final.plf" - finalPLF = open(finalPLFFile, "w+") - finalPLF.write(PLFline) - finalPLF.close() - - # now check if this is a valid PLF, if not write it's ID in a - # file so it can be checked later - proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) - line = proc.stdout.readline() - print("{} {}".format(line, lineNo)) - if line.strip() != "PLF format appears to be correct.": - os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) - invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - else: - provFile.write(PLFline) - else: - blankPLF.write(timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - # Now convert to PLF - lineNo += 1 - -provFile.close() -invalidPLF.close() -blankPLF.close() -rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh deleted file mode 100755 index d06e5fe911f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Gets the unique speakers from the file created by fsp_make_trans.pl -# Note that if a speaker appears multiple times, it is categorized as female - -import os -import sys - -tmpFileLocation = 'data/local/tmp/callhome_spk2gendertmp' - -tmpFile = None - -try: - tmpFile = open(tmpFileLocation) -except IOError: - print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' - -speakers = {} - -for line in tmpFile: - comp = line.split(' ') - if comp[0] in speakers: - speakers[comp[0]] = "f" - else: - speakers[comp[0]] = comp[1] - -for speaker, gender in speakers.iteritems(): - print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl deleted file mode 100755 index ec3dfd88037..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -use utf8; -use File::Basename; - -($tmpdir)=@ARGV; -$trans="$tmpdir/callhome_train_transcripts.flist"; -$reco="$tmpdir/callhome_reco2file_and_channel"; -open(T, "<", "$trans") || die "Can't open transcripts file"; -open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; -open(O, ">$tmpdir/callhome.text.1") || die "Can't open text file for writing"; -open(G, ">$tmpdir/callhome_spk2gendertmp") || die "Can't open the speaker to gender map file"; -binmode(O, ":utf8"); -while () { - $file = $_; - m:([^/]+)\.txt: || die "Bad filename $_"; - $call_id = $1; - print R "$call_id-A $call_id A\n"; - print R "$call_id-B $call_id B\n"; - open(I, "<$file") || die "Opening file $_"; - binmode(I, ":iso88591"); - #Now read each line and extract information - while () { - #136.37 138.10 B: Ah, bueno, mamita. - chomp; - - my @stringComponents = split(":", $_, 2); - my @timeInfo = split(" ", $stringComponents[0]); - $stringComponents[1] =~ s/^\s+|\s+$//g ; - my $words = $stringComponents[1]; - #Check number of components in this array - if ((scalar @stringComponents) >= 2) { - $start = sprintf("%06d", $timeInfo[0] * 100); - $end = sprintf("%06d", $timeInfo[1] * 100); - length($end) > 6 && die "Time too long $end in $file"; - $side = "A"; - if (index($timeInfo[2], "B") != -1) { - $side = "B"; - } - $utt_id = "${call_id}-$side-$start-$end"; - $speaker_id = "${call_id}-$side"; - # All speakers are treated as male because speaker gender info - # is missing in this file - $gender = "m"; - print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; - $words =~ s|\[\[[^]]*\]\]||g; #removes comments - $words =~ s|\{laugh\}|\$laughter\$|g; # replaces laughter tmp - $words =~ s|\[laugh\]|\$laughter\$|g; # replaces laughter tmp - $words =~ s|\{[^}]*\}|\[noise\]|g; # replaces noise - $words =~ s|\[[^]]*\]|\[noise\]|g; # replaces noise - $words =~ s|\[/*([^]]*)\]|\[noise\]|g; # replaces end of noise - $words =~ s|\$laughter\$|\[laughter\]|g; # replaces laughter again - $words =~ s|\(\(([^)]*)\)\)|\1|g; # replaces unintelligible speech - $words =~ s|<\?([^>]*)>|\1|g; # for unrecognized language - $words =~ s|background speech|\[noise\]|g; - $words =~ s|background noise|\[noise\]|g; - $words =~ s/\[/larrow/g; - $words =~ s/\]/rarrow/g; - $words =~ s/[[:punct:]]//g; - $words =~ s/larrow/\[/g; - $words =~ s/rarrow/\]/g; - $words =~ s/[¿¡]//g; - $words =~ s/\h+/ /g; # horizontal whitespace characters - $words = lc($words); - print O "$utt_id $words\n" || die "Error writing to text file"; - } - } - close(I); -} -close(T); -close(R); -close(O); -close(G); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh deleted file mode 100755 index 37e1eca1687..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -if [ $# -gt 0 ]; then - sentence=$1 - echo $sentence | sed 's:{^[}]*}:[noise]:' -fi - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh deleted file mode 100755 index 2f478419a18..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh +++ /dev/null @@ -1,294 +0,0 @@ -#!/bin/bash - -# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. -# with bypass resnet connections, and re-tuned. 
-# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:- -# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ] -# %SER 56.98 [ 3577 / 6278 ] -# Scored 6278 sentences, 0 not present in hyp. - -# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn -# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158) - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -train_set=train -test_sets="test dev" -gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -num_threads_ubm=32 -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= -gigaword_workdir= - -# LSTM/chain options -train_stage=-20 -xent_regularize=0.1 -dropout_schedule='0,0@0.20,0.3@0.50,0' - -# training chunk-options -chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 - -# training options -srand=0 -remove_egs=true - -#decode options -test_online_decoding=false # if true, it will run the last decoding stage. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 17 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 18 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 19 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" - tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" - linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" - prefinal_opts="l2-regularize=0.01" - output_opts="l2-regularize=0.005" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 - tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 - tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - linear-component name=prefinal-l dim=192 $linear_opts - - - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - - prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.0 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.srand $srand \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.frames-per-iter 5000000 \ - --trainer.optimization.num-jobs-initial 1 \ - --trainer.optimization.num-jobs-final=2 \ - --trainer.optimization.initial-effective-lrate 0.0005 \ - --trainer.optimization.final-effective-lrate 0.00005 \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.optimization.momentum 0.0 \ - --egs.chunk-width $chunk_width \ - --egs.chunk-left-context 0 \ - --egs.chunk-right-context 0 \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --cleanup.remove-egs $remove_egs \ - --use-gpu true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir exp/tri5a_lats_nodup_sp \ - --dir $dir || exit 1; -fi - -if [ $stage -le 21 ]; then - # The reason we are using data/lang_test here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - #LM was trained only on Fisher Spanish train subset. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ - $tree_dir $tree_dir/graph_fsp_train || exit 1; - -fi - -# Let's train first a small RNNLM on Fisher train set -rnnlmdir=exp/rnnlm_lstm_tdnn_1b -if [ $stage -le 22 ]; then - rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; -fi - -if [ $stage -le 23 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l ") - print(" Processes the text before text normalisation to convert uppercase words as space separated letters") - sys.exit() - -inputfile=codecs.open(sys.argv[1], encoding='utf-8') -outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') - -for line in inputfile: - words = line.split() - textout = "" - wordcnt = 0 - for word in words: - if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): - if wordcnt > 0: - word = re.sub('\'?s', 's', word) - textout = textout + " ".join(word) + " " - else: - textout = textout + word + " " - else: - textout = textout + word + " " - if word.isalpha(): wordcnt = wordcnt + 1 - outputfile.write(textout.strip()+ '\n') - -inputfile.close() -outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh deleted file mode 100755 index 1880b3a90cb..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash - -# Script to clean up gigaword LM text -# Removes punctuations, does case normalization - -stage=0 -nj=500 - -. ./path.sh -. ./cmd.sh -. 
./utils/parse_options.sh - -set -euo pipefail - -if [ $# -ne 2 ]; then - echo "Usage: $0 " - exit 1; -fi - -if [ ! -s `which normalizer_main` ] ; then - echo "Sparrowhawk normalizer was not found installed !" - echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" - exit 1 -fi - -txtdir=$1 -textdir=$(realpath $txtdir) -outdir=$(realpath $2) - -workdir=$outdir/tmp -if [ $stage -le 0 ]; then - rm -rf $outdir - mkdir -p $workdir - mkdir -p $textdir/splits - mkdir -p $outdir/data - split -l 1000000 $textdir/in.txt $textdir/splits/out - numsplits=0 - for x in $textdir/splits/*; do - numsplits=$((numsplits+1)) - ln -s $x $outdir/data/$numsplits - done - echo $numsplits - cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . - $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ - local/run_norm.sh \ - sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ - $outdir/data \ - JOB \ - $outdir/sparrowhawk/ - cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized - - # check if numbers are there in normalized output - awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ - $outdir/text_normalized > $outdir/unique_words - grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers -fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh deleted file mode 100755 index d48a96db5c4..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# No sanity checks here, they need to be added - -data=data/callhome_test -dir=exp/tri5a/decode_callhome_test -lang=data/lang -LMWT=13 - -[ -f ./path.sh ] && . ./path.sh - -cmd=run.pl -filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" -name=`basename $data`; -model=$dir/../final.mdl # assume model one level up from decoding dir. -symTable=$lang/words.txt - -if [ ! -f $dir/oracle/oracle.lat.gz ]; then - cat $data/text | utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ - lattice-oracle --write-lattices="ark:|gzip -c > $dir/oracle/oracle.lat.gz" \ - "ark:gunzip -c $dir/lat.*.gz|" ark:- ark:- > /dev/null 2>&1 -fi - -lattice-align-words $lang/phones/word_boundary.int $model \ - "ark:gunzip -c $dir/oracle/oracle.lat.gz|" ark:- | \ - lattice-1best --lm-scale=$LMWT ark:- ark:- | nbest-to-ctm ark:- - | \ - utils/int2sym.pl -f 5 $lang/words.txt | \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - > $dir/oracle/$name.ctm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh deleted file mode 100755 index 8a60dc9d422..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -data_dir=data -train_all=data/train_all - -if [ $# -lt 1 ]; then - echo "Specify the location of the split files" - exit 1; -fi - -splitFile=$1 - -# Train first -for split in train dev test dev2 -do - - cp -r $train_all $data_dir/$split - - awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ - $splitFile/$split $train_all/segments > $data_dir/$split/segments - - n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$split/segments | sort | uniq | wc -l` - - echo "$n conversations left in split $split" - - utils/fix_data_dir.sh $data_dir/$split - utils/validate_data_dir.sh $data_dir/$split -done - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh deleted file mode 100755 index 7d09f574580..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -. ./cmd.sh - -split=test -data_dir=data/test -decode_dir=exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4/ -lang_dir=data/lang - -# Create the STM file -# Always create this file before creating the CTM files so that -# channel numbers are properly created. -if [ ! -f $data_dir/stm ]; then - /export/a11/guoguo/babel/103-bengali-limitedLP.official/local/prepare_stm.pl $data_dir -fi - -# Create the CTM file -steps/get_ctm.sh $data_dir $lang_dir $decode_dir - -# Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} - -# Get the environment variables -. /export/babel/data/software/env.sh - -# Start scoring -/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ - $decode_dir - -# Print a summary of the result -grep "Percent Total Error" $decode_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py b/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py deleted file mode 100755 index 6f3d3f80c95..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -# Author : Gaurav Kumar (Johns Hopkins University) -# Gets a report on what the best word error rate was and which iteration -# led to it. This is needed both for reporting purposes and for setting -# the acoustic scale weight which extracting lattices. 
-# This script is specific to my partitions and needs to be made more general -# or modified - -from __future__ import print_function -import subprocess -import os - -decode_directories = ['exp/tri5a/decode_dev', - 'exp/tri5a/decode_test', - 'exp/tri5a/decode_dev2', - 'exp/sgmm2x_6a/decode_dev_fmllr', - 'exp/sgmm2x_6a/decode_test_fmllr', - 'exp/sgmm2x_6a/decode_dev2_fmllr', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4' - ] - -def get_best_wer(decode_dir): - best_iteration = 0 - best_wer = 100.0 - for i in range(16): - if os.path.isfile("{}/wer_{}".format(decode_dir, i)): - result = subprocess.check_output("tail -n 3 {}/wer_{}".format(decode_dir, i), shell=True) - wer_string = result.split("\n")[0] - wer_details = wer_string.split(' ') - # Get max WER - wer = float(wer_details[1]) - if wer < best_wer: - best_wer = wer - best_iteration = i - return best_iteration, best_wer - -for decode_dir in decode_directories[:6]: - print(decode_dir) - print(get_best_wer(decode_dir)) - -# Separate processing for bMMI stuff -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[6:10]: - iteration, wer = get_best_wer(decode_dir) - if wer < best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[10:14]: - iteration, wer = get_best_wer(decode_dir) - if wer < best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[14:18]: - iteration, wer = get_best_wer(decode_dir) - if wer < best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[18:22]: - iteration, wer = get_best_wer(decode_dir) - if wer <= best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[22:26]: - iteration, wer = get_best_wer(decode_dir) - if wer <= best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) 
- -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[26:]: - iteration, wer = get_best_wer(decode_dir) - if wer <= best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl deleted file mode 100755 index 2da41182d20..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env perl -#Finds unique phones from the basic rules file -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -use utf8; - -($b)=$ARGV[0]; -($tmpdir)=$ARGV[1]; -open(BB, "<", "$b/basic_rules") || die "Can't open basic rules"; -binmode(BB, ":iso88591"); -open(O, ">$tmpdir/phones") || die "Can't open text file for writing"; -binmode(O, ":utf8"); -my %phones = qw(); -while () { - chomp; - my @stringComponents = split(/\t/); - m/->\s(\S+)/; - my $phone = $1; - $phone =~ tr/áéíóú/aeiou/; - $phones{$phone} = 1; -} -foreach my $p (keys %phones) { - print O $p, "\n"; -} -#print keys %phones; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh deleted file mode 100755 index 20220d107bc..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash - -# Fixes the CALLHOME stm files -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -data_dir=$1 - -cat $data_dir/stm | awk '{$1=substr(tolower($1),0,length($1)-4);print;}' > $data_dir/stm_new -mv $data_dir/stm $data_dir/stm.bak -mv $data_dir/stm_new $data_dir/stm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh deleted file mode 100755 index 242359e7c28..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -set -e - -# Path to Gigaword corpus with all data files decompressed. -export GIGAWORDDIR=$1 -# The directory to write output to -export OUTPUTDIR=$2 -# The number of jobs to run at once -export NUMJOBS=$3 - -echo "Flattening Gigaword with ${NUMJOBS} processes..." -mkdir -p $OUTPUTDIR -find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; -echo "Combining the flattened files into one..." -cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py deleted file mode 100644 index 29f6766dd84..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- - -import logging -import os -import re -import spacy -import gzip - -from argparse import ArgumentParser -from bs4 import BeautifulSoup - -en_nlp = spacy.load("es") - - -def flatten_one_gigaword_file(file_path): - f = gzip.open(file_path) - html = f.read() - # Parse the text with BeautifulSoup - soup = BeautifulSoup(html, "html.parser") - - # Iterate over all
<p>
items and get the text for each. - all_paragraphs = [] - for paragraph in soup("p"): - # Turn inter-paragraph newlines into spaces - paragraph = paragraph.get_text() - paragraph = re.sub(r"\n+", "\n", paragraph) - paragraph = paragraph.replace("\n", " ") - # Tokenize the paragraph into words - tokens = en_nlp.tokenizer(paragraph) - words = [str(token) for token in tokens if not - str(token).isspace()] - if len(words) < 3: - continue - all_paragraphs.append(words) - # Return a list of strings, where each string is a - # space-tokenized paragraph. - return [" ".join(paragraph) for paragraph in all_paragraphs] - - -if __name__ == "__main__": - log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - logging.basicConfig(level=logging.INFO, format=log_fmt) - logger = logging.getLogger(__name__) - - parser = ArgumentParser(description=("Flatten a gigaword data file for " - "use in language modeling.")) - parser.add_argument("--gigaword-path", required=True, - metavar="", type=str, - help=("Path to Gigaword directory, with " - "all .gz files unzipped.")) - parser.add_argument("--output-dir", required=True, metavar="", - type=str, help=("Directory to write final flattened " - "Gigaword file.")) - - A = parser.parse_args() - all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) - output_path = os.path.join(A.output_dir, - os.path.basename(A.gigaword_path) + ".flat") - with open(output_path, "w") as output_file: - for paragraph in all_paragraphs: - output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh deleted file mode 100755 index 6b236be0ab9..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash -set -e - -. ./path_venv.sh - -# Path to Gigaword corpus with all data files decompressed. -GIGAWORDPATH=$1 -# The directory to write output to -OUTPUTDIR=$2 -file=$(basename ${GIGAWORDPATH}) -if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then - echo "flattening to ${OUTPUTDIR}/${file}.flat" - python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} -else - echo "skipping ${file}.flat" -fi - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh deleted file mode 100755 index fb765b57e69..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# - -if [ -f path.sh ]; then . ./path.sh; fi - -mkdir -p data/lang_test - -arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -mkdir -p data/lang_test -cp -r data/lang/* data/lang_test - -gunzip -c "$arpa_lm" | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - -echo Performing further checks - -# Checking that G.fst is determinizable. 
-fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -# Note: we do this with fstdeterminizestar not fstdeterminize, as -# fstdeterminize was taking forever (presumbaly relates to a bug -# in this version of OpenFst that makes determinization slow for -# some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminizestar >/dev/null || echo Error - -# Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic || echo "[log:] LG is not stochastic" - - -echo "$0 succeeded" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh deleted file mode 100755 index 22b98a6c9db..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/bash -# -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. -# To be run from one directory above this script. - -# Note: when creating your own data preparation scripts, it's a good idea -# to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the -# transcription file is exactly the same length as the scp file and is also -# sorted on utterance id (missing transcriptions should be removed from the -# scp file using e.g. scripts/filter_scp.pl) - -stage=0 - -export LC_ALL=C - - -if [ $# -lt 2 ]; then - echo "Usage: $0 " - echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" - exit 1; -fi - -cdir=`pwd` -dir=`pwd`/data/local/data -lmdir=`pwd`/data/local/nist_lm -mkdir -p $dir $lmdir -local=`pwd`/local -utils=`pwd`/utils -tmpdir=`pwd`/data/local/tmp -mkdir -p $tmpdir - -. ./path.sh || exit 1; # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command -# line arguments being absolute pathnames. -rm -r links/ 2>/dev/null -mkdir links/ -ln -s $* links - -# Basic spot checks to see if we got the data that we needed -if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; -then - echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti -vely" - exit 1; -fi - -#if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; -if [ ! -d links/LDC2010S01/data/speech ]; -then - echo "Speech directories missing or not properly organised within the speech data dir" - echo "Typical format is LDC2010S01/data/speech" - exit 1; -fi - -#Check the transcripts directories as well to see if they exist -if [ ! 
-d links/LDC2010T04/fisher_spa_tr/data/transcripts ]; -then - echo "Transcript directories missing or not properly organised" - echo "Typical format is LDC2010T04/fisher_spa_tr/data/transcripts" - exit 1; -fi - -#speech_d1=$dir/links/LDC2010S01/DISC1/data/speech -#speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -speech=$dir/links/LDC2010S01/data/speech -transcripts=$dir/links/LDC2010T04/fisher_spa_tr/data/transcripts - -#fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -#fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_s=`find ${speech} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts -#Now check if we got all the files that we needed -#if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -if [ $fcount_s != 819 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively (Total = 819)" - echo "The transcripts should contain 819 files" - exit 1; -fi - -if [ $stage -le 0 ]; then - #Gather all the speech files together to create a file list - #TODO: Train and test split might be required - ( - #find $speech_d1 -iname '*.sph'; - #find $speech_d2 -iname '*.sph'; - find $speech -iname '*.sph'; - ) > $tmpdir/train_sph.flist - - #Get all the transcripts in one place - find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist -fi - -if [ $stage -le 1 ]; then - $local/fsp_make_trans.pl $tmpdir - mkdir -p $dir/train_all - mv $tmpdir/reco2file_and_channel $dir/train_all/ -fi - -if [ $stage -le 2 ]; then - sort $tmpdir/text.1 | grep -v '((' | \ - awk '{if (NF > 1){ print; }}' | \ - sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ - sed 's:\([^<]*\)<\/lname>:\1:g' | \ - sed 's:::g' | \ - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed 's:<\s*cough[\/]*>:[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's:[^<]*<\/background>:[noise]:g' | \ - sed -r 's:<[/]?background[/]?>:[noise]:g' | \ - #One more time to take care of nested stuff - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ - #now handle the exceptions, find a cleaner way to do this? - sed 's:::g' | \ - sed 's:::g' | \ - sed 's:foreign>::g' | \ - sed 's:\[noise\]:[noise] :g' | \ - sed 's:>::g' | \ - #How do you handle numbers? - grep -v '()' | \ - #Now go after the non-printable characters and multiple spaces - sed -r 's:¿::g' | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $tmpdir/text.2 - cp $tmpdir/text.2 $dir/train_all/text - - #Create segments file and utt2spk file - ! 
cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ - && echo "Error producing utt2spk file" && exit 1; - - cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; - $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); if ($s != $e) {print "$utt $reco $s $e\n"}; ' >$dir/train_all/segments - - $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt -fi - -if [ $stage -le 3 ]; then - for f in `cat $tmpdir/train_sph.flist`; do - # convert to absolute path - make_absolute.sh $f - done > $tmpdir/train_sph_abs.flist - - cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp - cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ - sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; -fi - -if [ $stage -le 4 ]; then - # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. - cd $cdir - $local/fsp_make_spk2gender.sh > $dir/train_all/spk2gender -fi - -fix_data_dir.sh $dir/train_all || exit 1 -validate_data_dir.sh --no-feats $dir/train_all || exit 1 - -echo "Fisher Spanish Data preparation succeeded." - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl deleted file mode 100755 index 538bca58981..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env perl -# -# Johns Hopkins University (Author : Gaurav Kumar) -# -# This script should be run from one directory above the current one -# -# Rough partitions that are needed are : -# -# ASR Train : 120k utterances -# ASR tune : 20k utterances -# ASR eval : 20k utterances -# MT train : 105k utterances -# MT tune : Same as the ASR eval (20k utterances) -# MT eval : 20k utterances -# -# This script tries to find the closest possible matches so that conversations -# belong in one single partition and hence there is no speaker/conversation -# overlap between data partitions - -use Storable 'dclone'; - -$textfile="data/local/data/train_all/text"; -$tmp="data/local/tmp"; - -open(T, "<", "$textfile") || die "Can't open text file"; - -$ongoingConv = ""; -%tmpSplits = (); -@splitNumbers = (17455, 20000, 100000, 20000, 100000); -$splitId = 0; -%splits = (); - -while () { - @myStringComponents = split(/\s/); - @uttid = split('-', $myStringComponents[0]); - $currentConv = $uttid[0]; - if ($currentConv eq $ongoingConv) { - # Same conversation, add to current hash - #print "Same conversation"; - $tmpSplits{$ongoingConv} += 1; - } - else { - # New conversation intiated, first check if there are enough entries - # in the hash - #print $ongoingConv . " " . get_entries_hash(\%tmpSplits) . "\n"; - if (get_entries_hash(\%tmpSplits) > $splitNumbers[$splitId]) { - print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. 
\n"; - #$splits{$splitId} = keys %tmpSplits; - @newArr = keys %tmpSplits; - $splits{$splitId} = dclone(\@newArr); - %tmpSplits = (); - $splitId += 1; - } - $ongoingConv = $currentConv; - $tmpSplits{$ongoingConv} = 1; - } -} -# Put final tmpsplits in the right partition -@newArr = keys %tmpSplits; -$splits{$splitId} = dclone(\@newArr); -foreach (keys %splits) { - #print $_ , " ", $splits{$_}, "\n"; -} -print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. \n"; - -# Write splits to file -foreach my $key ( keys %splits ) { - open(S, ">$tmp/split-$key") || die "Can't open splitfile to write"; - foreach my $file ( @{$splits{$key}} ) { - print $file, "\n"; - print S "$file\n" || die "Error writing to file"; - } - close(S); -} - -sub get_entries_hash() { - my $inputHashRef = shift; - $total = 0; - foreach (keys %{$inputHashRef}) - { - $total += $inputHashRef->{$_}; - } - return $total; -} - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh deleted file mode 100755 index 15b1c0064cf..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Gets the unique speakers from the file created by fsp_make_trans.pl -# Note that if a speaker appears multiple times, it is categorized as female - -import os -import sys - -tmpFileLocation = 'data/local/tmp/spk2gendertmp' - -tmpFile = None - -try: - tmpFile = open(tmpFileLocation) -except IOError: - print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' - -speakers = {} - -for line in tmpFile: - comp = line.split(' ') - if comp[0] in speakers: - speakers[comp[0]] = "f" - else: - speakers[comp[0]] = comp[1] - -for speaker, gender in speakers.iteritems(): - print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl deleted file mode 100755 index 8c3f74e3917..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -use utf8; -use File::Basename; -($tmpdir)=@ARGV; -#$tmpdir='../data/local/tmp'; -$trans="$tmpdir/train_transcripts.flist"; -$reco="$tmpdir/reco2file_and_channel"; -open(T, "<", "$trans") || die "Can't open transcripts file"; -open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; -open(O, ">$tmpdir/text.1") || die "Can't open text file for writing"; -open(G, ">$tmpdir/spk2gendertmp") || die "Can't open the speaker to gender map file"; -binmode(O, ":utf8"); -while () { - $file = $_; - m:([^/]+)\.tdf: || die "Bad filename $_"; - $call_id = $1; - print R "$call_id-A $call_id A\n"; - print R "$call_id-B $call_id B\n"; - open(I, "<$file") || die "Opening file $_"; - binmode(I, ":utf8"); - # Get rid of header sections first - foreach ( 0..2 ) { - $tmpLine = ; - } - #Now read each line and extract information - while () { - #20051017_215732_274_fsp.sph 1 0.0 0.909856781803 Audrey female native Audrey 0 0 -1 - chomp; - my @stringComponents = split(/\t/); - - #Check number of components in this array - if ((scalar @stringComponents) >= 11) { - $start = sprintf("%06d", $stringComponents[2] * 100); - $end = sprintf("%06d", $stringComponents[3] * 100); - length($end) > 6 && die "Time too long $end in $file"; - $side = $stringComponents[1] ? "B" : "A"; - $words = $stringComponents[7]; - $utt_id = "${call_id}-$side-$start-$end"; - $speaker_id = "${call_id}-$side"; - $gender = "m"; - if ($stringComponents[5] == "female") { - $gender = "f"; - } - print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; - $words =~ s:/rarrow/g; - $words =~ s/[[:punct:]]//g; - $words =~ s/larrow//g; - $words =~ s:lendarrow: 0){ print; }}' > $tmpdir/uniquewords - if [ ! -f "${tmpdir}/es_wordlist.json" ]; then - echo "Could not find the large collection of Spanish words es_wordlist.json" - echo "Trying to download it via wget" - - if ! which wget >&/dev/null; then - echo "This script requires you to first install wget" - exit 1; - fi - - cwd=`pwd` - cd $tmpdir - wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz - - if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then - echo "Download of the large Spanish word list failed" - exit 1; - fi - - tar -xovzf es_wordlist.json.tgz || exit 1; - cd $cwd - fi - - # Merge with gigaword corpus - $local/merge_lexicons.py ${tmpdir} ${lexicon} - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords -fi - -#Then get the list of phones form basic_rules in the lexicon folder -if [ $stage -le 1 ]; then - if [ ! 
-d "$lexicon/callhome_spanish_lexicon_970908" ]; then - echo "Could not find folder callhome_spanish_lexicon_970908 in the lexicon folder" - exit 1; - fi - - # This is a preliminary attempt to get the unique phones from the LDC lexicon - # This will be extended based on our lexicon later - perl $local/find_unique_phones.pl $lexicon/callhome_spanish_lexicon_970908 $tmpdir - -fi - -#Get pronunciation for each word using the spron.pl file in the lexicon folder -if [ $stage -le 2 ]; then - #cd $lexicon/callhome_spanish_lexicon_970908 - # Replace all words for which no pronunciation was generated with an orthographic - # representation - cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ - | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ - | awk -F '[/][/]' '{print $1}' \ - > $tmpdir/lexicon_raw -fi - -#Break the pronunciation down according to the format required by Kaldi -if [ $stage -le 3 ]; then - # Creates a KALDI compatible lexicon, and extends the phone list - perl $local/isolate_phones.pl $tmpdir - cat $tmpdir/phones_extended | sort | awk '{if ($1 != "") {print;}}' > $tmpdir/phones_extended.1 - mv $tmpdir/phones $tmpdir/phones.small - mv $tmpdir/phones_extended.1 $tmpdir/phones - sort $tmpdir/phones -o $tmpdir/phones - paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | sed -r 's:(\S+)\s#.*:\1 oov:g' > $tmpdir/lexicon.1 - #paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | grep -v '#' > $tmpdir/lexicon.1 -fi - -if [ $stage -le 4 ]; then - # silence phones, one per line. - for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt - echo sil > $dir/optional_silence.txt - - # An extra question will be added by including the silence phones in one class. 
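-  # (Given the silence phones written above, the awk line below should end up
-  #  writing the single line "sil laughter noise oov" to extra_questions.txt.)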
- cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ - $dir/extra_questions.txt || exit 1; - - # Remove [] chars from phones - cat $tmpdir/phones | awk '{if ($1 != "_" && $1 != "[" && $1 != "]") {print;}}' > $tmpdir/phones.1 - rm $tmpdir/phones - mv $tmpdir/phones.1 $tmpdir/phones - cp $tmpdir/phones $dir/nonsilence_phones.txt - - if [ -f $tmpdir/lexicon.2 ]; then rm $tmpdir/lexicon.2; fi - cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" - - # Add prons for laughter, noise, oov - for w in `grep -v sil $dir/silence_phones.txt`; do - sed -i "/\[$w\]/d" $tmpdir/lexicon.2 - done - - for w in `grep -v sil $dir/silence_phones.txt`; do - echo "[$w] $w" - done | cat - $tmpdir/lexicon.2 > $tmpdir/lexicon.3 || exit 1; - - cat $tmpdir/lexicon.3 \ - <( echo "mm m" - echo " oov" ) > $tmpdir/lexicon.4 - - # From the lexicon remove _ from the phonetic representation - cat $tmpdir/lexicon.4 | sed 's:\s_::g' > $tmpdir/lexicon.5 - - cp "$tmpdir/lexicon.5" $dir/lexicon.txt - - cat $datadir/text | \ - awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ - sort -nr > $tmpdir/word_counts - - awk '{print $1}' $dir/lexicon.txt | \ - perl -e '($word_counts)=@ARGV; - open(W, "<$word_counts")||die "opening word-counts $word_counts"; - while() { chop; $seen{$_}=1; } - while() { - ($c,$w) = split; - if (!defined $seen{$w}) { print; } - } ' $tmpdir/word_counts > $tmpdir/oov_counts.txt - echo "*Highest-count OOVs are:" - head -n 20 $tmpdir/oov_counts.txt -fi - -$utils/validate_dict_dir.pl $dir -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh deleted file mode 100755 index cebf3b222ab..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh +++ /dev/null @@ -1,140 +0,0 @@ -#!/bin/bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# To be run from one level above this directory -# Generate the text for the LM training -tmp_dir=data/local/tmp -train_all=data/local/data/train_all - -if [ $# -lt 1 ]; then - echo "Specify the location of the split files" - exit 1; -fi - -splitFile=$1 -split=train -# Train only -if [ -d $tmp_dir/$split ]; then - rm -r $tmp_dir/$split -fi -cp -r $train_all $tmp_dir/$split - -awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ -$splitFile/$split $train_all/segments > $tmp_dir/$split/segments - -n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $tmp_dir/$split/segments | sort | uniq | wc -l` - -echo "$n conversations left in split $split" - -utils/fix_data_dir.sh $tmp_dir/$split -# There is no feature file yet, use --no-feats switch -utils/validate_data_dir.sh --no-feats $tmp_dir/$split - -# Now use this training text - -text=$tmp_dir/train/text -lexicon=data/local/dict/lexicon.txt - -for f in "$text" "$lexicon"; do - [ ! -f $x ] && echo "$0: No such file $f" && exit 1; -done - -# This script takes no arguments. It assumes you have already run -# fisher_data_prep.sh and fisher_prepare_dict.sh -# It takes as input the files -#data/train_all/text -#data/local/dict/lexicon.txt - -dir=`pwd`/data/local/lm -mkdir -p $dir -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:`pwd`/../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd ../../../tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. 
- else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - -mkdir -p $dir - - -cleantext=$dir/text.no_oov - -cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ - > $cleantext || exit 1; - - -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs -cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. -cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - -train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -exit 0 - -echo "Baseline" - -# From here is some commands to do a baseline with SRILM (assuming -# you have it installed). -heldout_sent=158126 # Don't change this if you want result to be comparable with - # kaldi_lm results -sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. -mkdir -p $sdir -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train - -cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist - - -ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ - -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz -ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout - -# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM -# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs -# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258 - - -# Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above. -# Difference in WSJ must have been due to different treatment of . -ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout - -# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM -# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs -# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py deleted file mode 100755 index 9c590635562..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -import os -import sys - -scoringFile = "exp/sgmm2x_6a_mmi_b0.2/decode_test_it4/scoring/10.tra" -wordsFile = open('exp/sgmm2x_6a/graph/words.txt') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') -oneBestTmp = 'exp/sgmm2x_6a_mmi_b0.2/one-best/asr-test' -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.test', 'w+') -timLocation = '/export/a04/gkumar/corpora/fishcall/fisher/tim' - -def findTranscription(timeDetail): - file1 = open(scoringFile) - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists(oneBestTmp): - os.makedirs(oneBestTmp) - -for item in fileList: - timingFile = open(timLocation + '/' + item + '.es') - newFile = open(oneBestTmp + '/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - - newFile.close() -provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl deleted file mode 100755 index ca5b2a46f8e..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env perl - -# Nagendra Kumar Goel - -# This takes two arguments: -# 1) Pocolm training output folder -# 2) rnnlm weights file name (for output) - -use POSIX; -use List::Util qw[min max]; - -if (@ARGV != 2) { - die "Usage: get_data_weights.pl \n"; -} - -$pdir = shift @ARGV; -$out = shift @ARGV; - -open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; -open(N, "<$pdir/names") || die "Could not open $pdir/names" ; -open(O, ">$out") || die "Could not open $out for writing" ; - -my %scores = (); - -while() { - @n = split(/\s/,$_); - $name = $n[1]; - $w =
<P>
; - @w = split(/\s/,$w); - $weight = $w[1]; - $scores{$name} = $weight; -} - -$min = min(values %scores); - -for(keys %scores) { - $weightout = POSIX::ceil($scores{$_} / $min); - print O "$_\t1\t$weightout\n"; -} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py deleted file mode 100755 index 5430c18bb5b..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -from __future__ import print_function -import os -import sys -import subprocess - -latticeLocation = 'latjosh-bmmi/lattices-pushed/' - -tmpdir = 'data/local/data/tmp/bmmi-t/lattmp' -invalidplfdir = 'data/local/data/tmp/bmmi-t/invalidplf' -symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' - -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/asr.test.plf', 'w+') -invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/invalidPLF', 'w+') -blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/blankPLF', 'w+') -rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/removeLines', 'w+') - -if not os.path.exists(tmpdir): - os.makedirs(tmpdir) -if not os.path.exists(invalidplfdir): - os.makedirs(invalidplfdir) -else: - os.system("rm " + invalidplfdir + "/*") - -def latticeConcatenate(lat1, lat2): - ''' - Concatenates lattices, writes temporary results to tmpdir - ''' - if lat1 == "": - os.system('rm ' + tmpdir + '/tmp.lat') - return lat2 - else: - proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) - proc.wait() - return tmpdir + '/tmp.lat' - - -def findLattice(timeDetail): - ''' - Finds the lattice corresponding to a time segment - ''' - if os.path.isfile(latticeLocation + timeDetail + '.lat'): - return latticeLocation + timeDetail + '.lat' - else: - return -1 - - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
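-# Each entry in a timing file names one segment lattice on disk; the segments of
-# an utterance are joined with OpenFst before PLF conversion.  The manual
-# equivalent of the calls below would be roughly (file names here are only
-# placeholders):
-#   fstconcat seg1.lat seg2.lat merged.lat
-#   fstrmepsilon merged.lat | fsttopsort - final.fst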
-# Now get timing information to concatenate the ASR outputs - -lineNo = 1 -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - for line in timingFile: - timeInfo = line.split() - - # For utterances that are concatenated in the translation file, - # the corresponding FSTs have to be translated as well - mergedTranslation = "" - for timeDetail in timeInfo: - tmp = findLattice(timeDetail) - if tmp != -1: - # Concatenate lattices - mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - - print(mergedTranslation) - if mergedTranslation != "": - - # Sanjeev's Recipe : Remove epsilons and topo sort - finalFST = tmpdir + "/final.fst" - os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) - - # Now convert to PLF - proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) - PLFline = proc.stdout.readline() - finalPLFFile = tmpdir + "/final.plf" - finalPLF = open(finalPLFFile, "w+") - finalPLF.write(PLFline) - finalPLF.close() - - # now check if this is a valid PLF, if not write it's ID in a - # file so it can be checked later - proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) - line = proc.stdout.readline() - print("{} {}".format(line, lineNo)) - if line.strip() != "PLF format appears to be correct.": - os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) - invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - else: - provFile.write(PLFline) - else: - blankPLF.write(timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - # Now convert to PLF - lineNo += 1 - -provFile.close() -invalidPLF.close() -blankPLF.close() -rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh deleted file mode 100755 index 451a7c529fb..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -# Gets lattice oracles -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -if [ $# -lt 3 ]; then - echo "Specify lattice dir, symbol table and text file for partition" - exit 1; -fi - -latticeDir=$1 -textFile=$3 -symTable=$2 -oracleDir=$latticeDir/oracle - -echo $latticeDir -echo $oracleDir - -. ./path.sh - -if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then - echo "Required files not found" - exit 1; -fi - -mkdir -p $oracleDir - -cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \ - utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ - $KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log - -sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py deleted file mode 100755 index fc13a7af701..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# 2018 Saikiran Valluri, GoVivace inc. 
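-# (Arguments, as used below: lexicon words file, pocolm words file,
-#  output rnnlm wordlist, output OOV wordlist.)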
- -import os, sys - -if len(sys.argv) < 5: - print( "Usage: python get_rnnlm_wordlist.py ") - sys.exit() - -lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") -pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") -rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") -oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") - -line_count=0 -lexicon=[] - -for line in lexicon_words: - lexicon.append(line.split()[0]) - rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') - line_count = line_count + 1 - -for line in pocolm_words: - if not line.split()[0] in lexicon: - oov_wordlist.write(line.split()[0]+'\n') - rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') - line_count = line_count + 1 - -lexicon_words.close() -pocolm_words.close() -rnnlm_wordsout.close() -oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py deleted file mode 100644 index 3ecd16772d7..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# 2018 Saikiran Valluri, GoVivace inc. - -import os, sys - -if len(sys.argv) < 3: - print("Usage : python . ") - print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") - sys.exit() - -pocolmdir=sys.argv[1] -unigramwts=open(sys.argv[2], 'w') - -names = open(pocolmdir+"/names", 'r') -metaparams = open(pocolmdir+"/metaparameters", 'r') - -name_mapper={} -for line in names: - fields=line.split() - name_mapper[fields[0]] = fields[1] - -lns = metaparams.readlines() -for lineno in range(len(name_mapper.keys())): - line = lns[lineno] - fileid = line.split()[0].split("_")[-1] - weight = line.split()[1] - unigramwts.write(name_mapper[fileid] + " " + weight + "\n") - -names.close() -unigramwts.close() -metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl deleted file mode 100755 index 0366dcdacb0..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Once the phonetic representation for words is generated by the LDC lexicon -# This script converts them into a KALDI compatible format -# In addition, it extends the list of phonemes to consider based on -# orthograhic representations of those words which do not have stressed vowels - -use utf8; - -($tmpdir)=$ARGV[0]; -open(L, "<", "$tmpdir/lexicon_raw") || die "Can't open raw lexicon"; -open(P, "<" , "$tmpdir/phones") || die "Can't open phone file"; -open(I, ">$tmpdir/lexicon_one_column") || die "Can't open text file for writing"; -open(E, ">$tmpdir/phones_extended") || die "Can't open ex-phone file for writing"; -binmode(P, ":utf8"); -binmode(L, ":utf8"); -binmode(I, ":utf8"); -binmode(E, ":utf8"); - -#Get all phones -my %phones = qw(); -while (
<P>
) { - chomp; - $phones{$_} = 1; -} - -print @phones; - -while () { - if (substr($_, 0, 1) eq "#") { - print I $_; - next; - } - $len = length; - $current = 0; - $splitWord = ""; - while ($current < $len) { - #First check for two char codes - $currentChar2 = substr($_, $current, 2); - $currentChar1 = substr($_, $current, 1); - if (exists($phones{$currentChar2})) { - $splitWord = $splitWord . " " . $currentChar2; - $current = $current + 2; - } - else { - # Check if this phone exists - if (!exists($phones{$currentChar1})) { - $phones{$currentChar1} = 1 - } - $splitWord = $splitWord . " " . $currentChar1; - $current = $current + 1; - } - } - $splitWord =~ s/^\s*(.*?)\s*$/$1/; - print I $splitWord, "\n"; -} - -# Now write the phones to the extended phone file -foreach my $key (keys %phones) { - print E $key, "\n"; -} - -close(L); -close(P); -close(I); -close(E); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh deleted file mode 100755 index bbe0af5810c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bash -# Author : Gaurav Kumar, Johns Hopkins University -# Creates OpenFST lattices from Kaldi lattices -# This script needs to be run from one level above this directory - -. ./path.sh - -if [ $# -lt 3 ]; then - echo "Enter the latdir (where the lattices will be put), the decode dir containing lattices and the acoustic scale" - exit 1 -fi - -prunebeam=2 - -latdir=$1 -decode_dir=$2 -acoustic_scale=$3 -#latdir="latjosh-2-callhome" -#decode_dir=exp/tri5a/decode_$partition -#acoustic_scale=0.077 - -stage=0 - -if [ -d $decode_dir ] -then - # TODO:Add scaling factor for weights, how? - rawLatDir="lattices" - compiledLatDir="lattices-bin" - preplfLatDir="lattices-pushed" - - mkdir -p $latdir - mkdir -p $latdir/$rawLatDir - mkdir -p $latdir/$compiledLatDir - mkdir -p $latdir/$preplfLatDir - - for l in $decode_dir/lat.*.gz - do - ( - # Extract file name and unzip the file first - bname=${l##*/} - bname="$latdir/${bname%.gz}" - gunzip -c $l > "$bname.bin" - - if [ $stage -le 0 ]; then - - # Now copy into ark format - $KALDI_ROOT/src/latbin/lattice-copy ark:$bname.bin ark,t:- > "$bname.raw" - - # Prune lattices - $KALDI_ROOT/src/latbin/lattice-prune --acoustic-scale=$acoustic_scale --beam=$prunebeam ark:"$bname.raw" ark:"$bname.pruned" - - # Convert to an openfst compatible format - $KALDI_ROOT/src/latbin/lattice-to-fst --lm-scale=1.0 --acoustic-scale=$acoustic_scale ark:$bname.pruned ark,t:$bname.ark.fst - - fi - - if [ $stage -le 1 ]; then - fileName="" - fileLine=0 - - while read line; do - if [ $fileLine = 0 ]; then - fileName="$line" - fileLine=1 - continue - fi - if [ -z "$line" ]; then - fileLine=0 - continue - fi - # Replace laugh, unk, oov, noise with eps - echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat" - done < $bname.ark.fst - echo "Done isolating lattices" - fi - ) & - done - wait - rm $latdir/*.bin - rm $latdir/*.pruned - - - if [ $stage -le 2 ]; then - #Compile lattices - for l in $latdir/$rawLatDir/*.lat - do - ( - # Arc type needs to be log - bname=${l##*/} - fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname - ) & - done - wait - echo "Done compiling lattices." 
- fi - - if [ $stage -le 3 ]; then - #Sanjeev's Recipe for creating valid PLF compatible FSTs" - # Create a dummy FST with one state and no arcs first - echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst - # Push Lattice weights towards initial state - for l in $latdir/$compiledLatDir/*.lat - do - ( - bname=${l##*/} - fstrmepsilon $latdir/$compiledLatDir/$bname | \ - fstpush --push_weights --remove_total_weight - | \ - # Do not topo sort here, do it before converting into PLF - # Sanjeev's Recipe : Concatenate with dummy FST - fstconcat - $latdir/$preplfLatDir/dummy.fst | \ - fstreverse - | \ - fstrmepsilon - | \ - fstreverse - $latdir/$preplfLatDir/$bname - ) & - done - wait - # Let's take a moment to thank the dummy FST for playing its - # part in this process. However, it has to go now. - rm $latdir/$preplfLatDir/dummy.fst - echo "Done performing fst push (initial state)" - fi -else - echo "Complete training and decoding first" -fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py deleted file mode 100755 index 94546dc44c3..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# 2018 Saikiran Valluri, GoVivace inc., Avaaya - -# Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon -from __future__ import print_function -import sys -import re -import json -import codecs -import operator - -wordlimit = 64000 -tmpdir = sys.argv[1] -ldc_lexicon = sys.argv[2] -uw_fisher = tmpdir + "/uniquewords" -uw_gigaword = tmpdir + "/es_wordlist.json" -uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" - -filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') -merged_lexicon = [] -# All three lexicons are in different formats -# First add the data from lexicon_fisher (A) into the dictionary -fisher = codecs.open(uw_fisher, encoding='utf-8') -for line in fisher: - merged_lexicon.append(line.strip()) -fisher.close() - -print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) - -# Now add data from the LDC lexicon -ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) - -print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) - -# Finally add the gigaword data -gigaword = json.load(open(uw_gigaword)) -gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) - -for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break - - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) - -# Now write the uniquewords to a file -lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') -ltuples = sorted(merged_lexicon) - -for item in ltuples: - if not item==u'ñ' and not re.search(filtered_letters, item): - lf.write(item + "\n") - -lf.close() - -print("Finshed writing unique words") diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh deleted file mode 100755 index a95893f698a..00000000000 --- 
a/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -currentJob=0 - -dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats - -for f in $dir/.done.*; do - d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` - if [ $d -gt $currentJob ]; then - currentJob=$d - fi -done - -currentJob=$((currentJob+1)) - -echo Currently processing job : $currentJob - -for i in $(seq 210); do - job[$i]=$i -done - -dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats/log/$currentJob/q - -for f in $dir/done.*; do - d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` - unset job[$d] -done - -echo sub-splits left : ${#job[@]} -echo ${job[@]} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh deleted file mode 100755 index cc9de4d26c5..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh +++ /dev/null @@ -1,187 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -# This script is called from scripts like local/nnet3/run_tdnn.sh and -# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It -# contains the common feature preparation and iVector-related parts of the -# script. See those scripts for examples of usage. - - -stage=7 -nj=30 -train_set=train # you might set this to e.g. train. -test_sets="test dev" -gmm=tri5a # This specifies a GMM-dir from the features of the type you're training the system on; - # it should contain alignments for 'train_set'. - -num_threads_ubm=32 -nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. - # in the tedlium recip it's _cleaned). - -. ./cmd.sh -. ./path.sh -. utils/parse_options.sh - - -gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp - -for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do - if [ ! -f $f ]; then - echo "$0: expected file $f to exist" - exit 1 - fi -done - - - -if [ $stage -le 7 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then - echo "$0: data/${train_set}_sp_hires/feats.scp already exists." - echo " ... Please either remove it, or rerun this script with stage > 7." - exit 1 -fi - - -if [ $stage -le 8 ]; then - echo "$0: preparing directory for speed-perturbed data" - utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp -fi - -if [ $stage -le 9 ]; then - echo "$0: creating high-resolution MFCC features" - - # this shows how you can split across multiple file-systems. we'll split the - # MFCC dir across multiple locations. You might want to be careful here, if you - # have multiple copies of Kaldi checked out and run the same recipe, not to let - # them overwrite each other. - mfccdir=data/${train_set}_sp_hires/data - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage - fi - - for datadir in ${train_set}_sp ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # do volume-perturbation on the training data prior to extracting hires - # features; this helps make trained nnets more invariant to test data volume. 
- utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - - for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires - steps/compute_cmvn_stats.sh data/${datadir}_hires - utils/fix_data_dir.sh data/${datadir}_hires - done -fi - -if [ $stage -le 10 ]; then - echo "$0: computing a subset of data to train the diagonal UBM." - - mkdir -p exp/nnet3${nnet3_affix}/diag_ubm - temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm - - # train a diagonal UBM using a subset of about a quarter of the data - num_utts_total=$(wc -l in the history of a n-gram -# un-comment the following line -#limit_unk_history_opt="--limit-unk-history=true" - -for order in ${ngram_order}; do - # decide on the vocabulary. - # Note: you'd use --wordlist if you had a previously determined word-list - # that you wanted to use. - lm_name="${num_word}_${order}" - min_counts='' - # Note: the following might be a more reasonable setting: - # min_counts='fisher=2 swbd1=1' - if [ -n "${min_counts}" ]; then - lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" - fi - unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm - train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ - --min-counts=${min_counts} \ - --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ - ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} - - if [ $pocolm_stage -eq 2 ];then - mkdir -p ${arpa_dir} - format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz - - # example of pruning. note: the threshold can be less than or more than one. - get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - for threshold in 1.0 2.0 4.0; do - pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm - prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 - get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' - - format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz - - done - - # example of pruning by size. - size=1000000 - pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm - prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' - get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' - - format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz - fi -done - -# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) - -# the following does does some self-testing, including -# that the computed derivatives are accurate. 
-# local/self_test.sh - -# perplexities from pocolm-estimated language models with pocolm's interpolation -# method from orders 3, 4, and 5 are: -# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) -# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) -# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) - -# note, the perplexities from pocolm-estimated language models with SRILM's -# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), -# 78.8449 and 75.2202 respectively. - -# note, the perplexities from SRILM-estimated language models with SRILM's -# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), -# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py deleted file mode 100755 index 5c68e1204b2..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# Processes lattice oracles - -import os -import sys - -oracleDir = "exp/tri5a/decode_callhome_train/oracle" -wordsFile = open('exp/sgmm2x_6a/graph/words.txt') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') -oracleTmp = 'exp/tri5a/one-best/oracle-ch-train' -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/oracle.train', 'w+') -timLocation = '/export/a04/gkumar/corpora/fishcall/callhome/tim' - -def findTranscription(timeDetail): - file1 = open(oracleDir + "/oracle.tra") - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
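-# (oracle.tra from lattice-oracle holds integer word ids; the id-to-word mapping
-#  applied below via words.txt is essentially what
-#  utils/int2sym.pl -f 2- words.txt <oracle.tra would produce.)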
-# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists(oracleTmp): - os.makedirs(oracleTmp) - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -for item in fileList: - timingFile = open(timLocation + '/' + item + '.es') - newFile = open(oracleTmp + '/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - - newFile.close() -provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh deleted file mode 100755 index 1b54b304e50..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -. ./cmd.sh - -for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test_fmllr \ - exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it$iter & -done - - -for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_dev data/lang data/dev exp/sgmm2x_6a/decode_dev_fmllr \ - exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it$iter & -done - - -for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_dev2 data/lang data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr \ - exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it$iter & -done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh deleted file mode 100755 index 3850910f312..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) -# 2015 Guoguo Chen -# 2017 Hainan Xu -# 2017 Xiaohui Zhang - -# This script trains LMs on the swbd LM-training data. - -# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. -# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. -# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 -# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 - - -dir=Spanish_gigawrd/rnnlm -pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned -wordslist= -embedding_dim=1024 -lstm_rpd=256 -lstm_nrpd=256 -stage=0 -train_stage=-30 -text_dir=Spanish_gigawrd/text_lm - -. ./cmd.sh -. ./utils/parse_options.sh - -mkdir -p $dir/config -set -e - -for f in $text_dir/dev.txt; do - [ ! 
-f $f ] && \ - echo "$0: expected file $f to exist;" && exit 1 -done - -if [ $stage -le 0 ]; then - if [ -f $text_dir/unigram_weights ] ; then - mv $text_dir/unigram_weights $pocolm_dir/ - fi - cp $wordslist $dir/config/words.txt - n=`cat $dir/config/words.txt | wc -l` - echo " $n" >> $dir/config/words.txt - - # words that are not present in words.txt but are in the training or dev data, will be - # mapped to during training. - echo "" >$dir/config/oov.txt - local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt - rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ - --unk-word="" \ - --data-weights-file=$dir/config/data_weights.txt \ - $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt - - # choose features - rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ - --use-constant-feature=true \ - --special-words=',,,,[noise],[laughter]' \ - $dir/config/words.txt > $dir/config/features.txt -fi - -if [ $stage -le 1 ]; then - cat <$dir/config/xconfig - input dim=$embedding_dim name=input - relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) - fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd - relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) - fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd - relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) - output-layer name=output include-log-softmax=false dim=$embedding_dim -EOF - rnnlm/validate_config_dir.sh $text_dir $dir/config -fi - -if [ $stage -le 2 ]; then - rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir -fi - -if [ $stage -le 3 ]; then - rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ - --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir -fi - -exit 0 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh deleted file mode 100755 index f88fecc815c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -punctuation_symbols=( "," "\"" "\`" "\:" "(" ")" "-" ";" "?" "!" 
"/" "_" "{" "}" "*" ) - -config=$1 -path_prefix=$2 -data=$3 -job=$4 -dir=$5 - -substitute_arg="" -num_syms=0 - -for i in "${punctuation_symbols[@]}"; do - symbol=${punctuation_symbols[${num_syms}]} - if [ $num_syms -eq 0 ]; then - substitute_arg="sed 's:${i}: :g'" - else - substitute_arg=$substitute_arg" | sed 's:${i}: :g'" - fi - substitute_arg=$substitute_arg" |sed 's:${i}$: :g' | sed 's:^${i}: :g'" - num_syms=$((num_syms+1)) -done -mkdir -p $dir/normalize/$job -local/clean_abbrevs_text.py $data/$job $data/"$job"_processed -mv $data/"$job"_processed $data/$job -echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh - -bash $dir/normalize/$job/substitute.sh | \ - sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ - sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text -normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh deleted file mode 100755 index 9148b1f1171..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# This is as run_sgmm2.sh but excluding the "speaker-dependent weights", -# so not doing the symmetric SGMM. - -. ./cmd.sh - -## SGMM on top of LDA+MLLT+SAT features. -if [ ! -f exp/ubm6a/final.mdl ]; then - steps/train_ubm.sh --silence-weight 0.5 --cmd "$train_cmd" 800 data/train data/lang exp/tri5a_ali exp/ubm6a || exit 1; -fi -# Double the number of SAT states : sanjeev -steps/train_sgmm2.sh --spk-dep-weights false --cmd "$train_cmd" 10000 120000 \ - data/train data/lang exp/tri5a_ali exp/ubm6a/final.ubm exp/sgmm2x_6a || exit 1; - -utils/mkgraph.sh data/lang_test exp/sgmm2x_6a exp/sgmm2x_6a/graph || exit 1; - -steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev || exit 1; - -steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev_fmllr || exit 1; - -steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test || exit 1; - -steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test_fmllr || exit 1; - -steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2 || exit 1; - -steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr || exit 1; - - # Now we'll align the SGMM system to prepare for discriminative training. 
- steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri5a \ - --use-graphs true --use-gselect true data/train data/lang exp/sgmm2x_6a exp/sgmm2x_6a_ali || exit 1; - steps/make_denlats_sgmm2.sh --nj 30 --sub-split 210 --cmd "$decode_cmd" --transform-dir exp/tri5a \ - data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats - steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri5a --boost 0.2 \ - data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats exp/sgmm2x_6a_mmi_b0.2 - - for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test exp/sgmm2x_6a_mmi_b0.2/decode_test_it$iter & - done - -wait -steps/decode_combine.sh data/test data/lang exp/tri1/decode exp/tri2a/decode exp/combine_1_2a/decode || exit 1; -steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_mmi/decode exp/combine_sgmm2x_4a_3b/decode || exit 1; -# combining the sgmm run and the best MMI+fMMI run. -steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_3b_fmmic5/decode || exit 1; - -steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a_mmi_b0.2/decode_it4 exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_mmi_3b_fmmic5/decode || exit 1; - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh deleted file mode 120000 index 0afefc3158c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh +++ /dev/null @@ -1 +0,0 @@ -../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh deleted file mode 100755 index 21b793a4d27..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -oracle_dir=exp/tri5a/decode_callhome_test/oracle -split=callhome_test -data_dir=data/callhome_test -lang_dir=data/lang - -# Make sure that your STM and CTM files are in UTF-8 encoding -# Any other encoding will cause this script to fail/misbehave - -if [ ! -e $oracle_dir -o ! -e $data_dir -o ! -e $lang_dir ]; then - echo "Missing pre-requisites" - exit 1 -fi - -for i in {5..20}; do - mkdir -p $oracle_dir/score_$i - cp $oracle_dir/$split.ctm $oracle_dir/score_$i/ -done - -. 
/export/babel/data/software/env.sh - -# Start scoring -/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ - $oracle_dir - -# Print a summary of the result -grep "Percent Total Error" $oracle_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev deleted file mode 100644 index 77e3b01786f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev +++ /dev/null @@ -1,20 +0,0 @@ -sp_0897.sph -sp_0968.sph -sp_0981.sph -sp_1062.sph -sp_1292.sph -sp_1411.sph -sp_1413.sph -sp_1552.sph -sp_1554.sph -sp_1805.sph -sp_1808.sph -sp_1882.sph -sp_1930.sph -sp_1947.sph -sp_2037.sph -sp_2054.sph -sp_2057.sph -sp_2107.sph -sp_2109.sph -sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev deleted file mode 100644 index 77e3b01786f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev +++ /dev/null @@ -1,20 +0,0 @@ -sp_0897.sph -sp_0968.sph -sp_0981.sph -sp_1062.sph -sp_1292.sph -sp_1411.sph -sp_1413.sph -sp_1552.sph -sp_1554.sph -sp_1805.sph -sp_1808.sph -sp_1882.sph -sp_1930.sph -sp_1947.sph -sp_2037.sph -sp_2054.sph -sp_2057.sph -sp_2107.sph -sp_2109.sph -sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test deleted file mode 100644 index 0cbc3cc95fd..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test +++ /dev/null @@ -1,20 +0,0 @@ -sp_0053.sph -sp_0082.sph -sp_0084.sph -sp_0088.sph -sp_0681.sph -sp_0699.sph -sp_0776.sph -sp_0857.sph -sp_1031.sph -sp_1100.sph -sp_1148.sph -sp_1156.sph -sp_1186.sph -sp_1212.sph -sp_1345.sph -sp_1435.sph -sp_1578.sph -sp_1648.sph -sp_1807.sph -sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train deleted file mode 100644 index 2c936072534..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train +++ /dev/null @@ -1,80 +0,0 @@ -sp_0085.sph -sp_0096.sph -sp_0098.sph -sp_0100.sph -sp_0291.sph -sp_0713.sph -sp_0724.sph -sp_0726.sph -sp_0731.sph -sp_0733.sph -sp_0753.sph -sp_0788.sph -sp_0826.sph -sp_0831.sph -sp_0836.sph -sp_0841.sph -sp_0850.sph -sp_0855.sph -sp_0892.sph -sp_0899.sph -sp_0910.sph -sp_0917.sph -sp_0919.sph -sp_0923.sph -sp_0945.sph -sp_0950.sph -sp_0951.sph -sp_0992.sph -sp_0997.sph -sp_1013.sph -sp_1039.sph -sp_1044.sph -sp_1045.sph -sp_1058.sph -sp_1060.sph -sp_1063.sph -sp_1081.sph -sp_1106.sph -sp_1122.sph -sp_1140.sph -sp_1175.sph -sp_1195.sph -sp_1198.sph -sp_1231.sph -sp_1234.sph -sp_1255.sph -sp_1260.sph -sp_1261.sph -sp_1262.sph -sp_1264.sph -sp_1266.sph -sp_1273.sph -sp_1275.sph -sp_1284.sph -sp_1286.sph -sp_1304.sph -sp_1308.sph -sp_1333.sph -sp_1341.sph -sp_1353.sph -sp_1368.sph -sp_1379.sph -sp_1384.sph -sp_1449.sph -sp_1463.sph -sp_1574.sph -sp_1740.sph -sp_1759.sph -sp_1849.sph -sp_1908.sph -sp_1915.sph -sp_1918.sph -sp_1974.sph -sp_1976.sph -sp_1988.sph -sp_2000.sph -sp_2056.sph -sp_2070.sph -sp_2091.sph -sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev deleted file mode 100644 index d3769f0ffb5..00000000000 
--- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev +++ /dev/null @@ -1,20 +0,0 @@ -20051009_182032_217_fsp.sph -20051009_210519_219_fsp.sph -20051010_212418_225_fsp.sph -20051016_180547_265_fsp.sph -20051016_210626_267_fsp.sph -20051017_180712_270_fsp.sph -20051017_220530_275_fsp.sph -20051017_234550_276_fsp.sph -20051018_210220_279_fsp.sph -20051018_210744_280_fsp.sph -20051019_190221_288_fsp.sph -20051019_210146_289_fsp.sph -20051019_230329_292_fsp.sph -20051022_180817_311_fsp.sph -20051023_232057_325_fsp.sph -20051024_180453_327_fsp.sph -20051024_181110_329_fsp.sph -20051025_212334_337_fsp.sph -20051026_180724_341_fsp.sph -20051026_211309_346_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 deleted file mode 100644 index f1b5c293d67..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 +++ /dev/null @@ -1,20 +0,0 @@ -20050909_210655_26_fsp.sph -20050910_210708_33_fsp.sph -20050913_210933_49_fsp.sph -20050913_211649_50_fsp.sph -20050915_210434_65_fsp.sph -20050916_180332_68_fsp.sph -20050918_180733_81_fsp.sph -20050918_210841_82_fsp.sph -20050920_212030_93_fsp.sph -20050921_210443_99_fsp.sph -20050923_211304_115_fsp.sph -20050925_180713_120_fsp.sph -20050925_180825_121_fsp.sph -20050926_180516_125_fsp.sph -20050926_180555_126_fsp.sph -20050928_000254_141_fsp.sph -20050930_210540_161_fsp.sph -20051002_180726_170_fsp.sph -20051007_181850_205_fsp.sph -20051007_191217_206_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test deleted file mode 100644 index 6190ced077c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test +++ /dev/null @@ -1,20 +0,0 @@ -20051028_180633_356_fsp.sph -20051029_211606_365_fsp.sph -20051030_193924_371_fsp.sph -20051101_212731_386_fsp.sph -20051102_134901_389_fsp.sph -20051102_180402_391_fsp.sph -20051102_181501_393_fsp.sph -20051103_211105_404_fsp.sph -20051103_233456_406_fsp.sph -20051107_184634_438_fsp.sph -20051109_180253_445_fsp.sph -20051109_210353_450_fsp.sph -20051111_181045_470_fsp.sph -20051111_182216_472_fsp.sph -20051112_181649_485_fsp.sph -20051113_155059_492_fsp.sph -20051113_210221_496_fsp.sph -20051113_214925_498_fsp.sph -20051114_181749_505_fsp.sph -20051115_212123_516_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train deleted file mode 100644 index b57683842b2..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train +++ /dev/null @@ -1,759 +0,0 @@ -20050908_182943_22_fsp.sph -20050908_191808_23_fsp.sph -20050909_210428_25_fsp.sph -20050909_221657_28_fsp.sph -20050910_180310_29_fsp.sph -20050910_180330_30_fsp.sph -20050910_181354_31_fsp.sph -20050910_190223_32_fsp.sph -20050911_180647_34_fsp.sph -20050911_200216_35_fsp.sph -20050911_210429_36_fsp.sph -20050911_210530_37_fsp.sph -20050911_210904_38_fsp.sph -20050912_181441_40_fsp.sph -20050912_181538_41_fsp.sph -20050912_182044_42_fsp.sph -20050912_212913_43_fsp.sph -20050913_180324_44_fsp.sph -20050913_180731_46_fsp.sph -20050913_180947_47_fsp.sph -20050913_210409_48_fsp.sph -20050914_000831_51_fsp.sph -20050914_180332_52_fsp.sph -20050914_180606_53_fsp.sph -20050914_181020_54_fsp.sph -20050914_210243_55_fsp.sph 
-20050914_210822_56_fsp.sph -20050914_220753_58_fsp.sph -20050915_180728_60_fsp.sph -20050915_180740_61_fsp.sph -20050915_192457_62_fsp.sph -20050915_194045_63_fsp.sph -20050915_210200_64_fsp.sph -20050915_210916_66_fsp.sph -20050915_212325_67_fsp.sph -20050916_180740_69_fsp.sph -20050916_200334_70_fsp.sph -20050916_210235_71_fsp.sph -20050916_210510_72_fsp.sph -20050916_223656_73_fsp.sph -20050917_210406_74_fsp.sph -20050917_210805_75_fsp.sph -20050917_211045_76_fsp.sph -20050917_212041_77_fsp.sph -20050918_180326_80_fsp.sph -20050919_000612_83_fsp.sph -20050919_180511_84_fsp.sph -20050919_180703_85_fsp.sph -20050919_180925_86_fsp.sph -20050919_190254_87_fsp.sph -20050920_180330_88_fsp.sph -20050920_180342_89_fsp.sph -20050920_180607_90_fsp.sph -20050920_181919_91_fsp.sph -20050920_211414_92_fsp.sph -20050920_230520_94_fsp.sph -20050921_180639_95_fsp.sph -20050921_181002_96_fsp.sph -20050921_210340_98_fsp.sph -20050921_211329_101_fsp.sph -20050921_221625_102_fsp.sph -20050922_180618_103_fsp.sph -20050922_180948_104_fsp.sph -20050922_210740_106_fsp.sph -20050922_211003_107_fsp.sph -20050922_230412_108_fsp.sph -20050923_180514_110_fsp.sph -20050923_180530_111_fsp.sph -20050923_210442_114_fsp.sph -20050924_180747_117_fsp.sph -20050924_181124_118_fsp.sph -20050925_210645_122_fsp.sph -20050925_231407_123_fsp.sph -20050926_000425_124_fsp.sph -20050926_180719_127_fsp.sph -20050926_220244_130_fsp.sph -20050926_230706_131_fsp.sph -20050927_180422_132_fsp.sph -20050927_181033_133_fsp.sph -20050927_181232_134_fsp.sph -20050927_210320_135_fsp.sph -20050927_210848_136_fsp.sph -20050927_210947_138_fsp.sph -20050927_211929_139_fsp.sph -20050927_231016_140_fsp.sph -20050928_180631_142_fsp.sph -20050928_210256_144_fsp.sph -20050928_210700_145_fsp.sph -20050928_211113_146_fsp.sph -20050928_220320_147_fsp.sph -20050928_232236_148_fsp.sph -20050929_180318_149_fsp.sph -20050929_180722_150_fsp.sph -20050929_180932_151_fsp.sph -20050929_211337_153_fsp.sph -20050929_220820_154_fsp.sph -20050929_230406_155_fsp.sph -20050930_180329_156_fsp.sph -20050930_180411_157_fsp.sph -20050930_180646_158_fsp.sph -20050930_200308_159_fsp.sph -20051001_180328_163_fsp.sph -20051001_181004_164_fsp.sph -20051001_210749_166_fsp.sph -20051001_211346_167_fsp.sph -20051002_180339_169_fsp.sph -20051002_210324_171_fsp.sph -20051002_220651_174_fsp.sph -20051003_180434_175_fsp.sph -20051003_211042_178_fsp.sph -20051003_220633_179_fsp.sph -20051004_180351_180_fsp.sph -20051004_180542_181_fsp.sph -20051004_180730_182_fsp.sph -20051004_200737_183_fsp.sph -20051004_211611_185_fsp.sph -20051005_180420_187_fsp.sph -20051005_180709_188_fsp.sph -20051005_213606_191_fsp.sph -20051005_220917_192_fsp.sph -20051005_230659_193_fsp.sph -20051006_180416_194_fsp.sph -20051006_180653_195_fsp.sph -20051006_180815_196_fsp.sph -20051006_181525_197_fsp.sph -20051006_183153_199_fsp.sph -20051006_210246_200_fsp.sph -20051006_210417_201_fsp.sph -20051006_220329_203_fsp.sph -20051008_000036_208_fsp.sph -20051008_180249_209_fsp.sph -20051008_181720_210_fsp.sph -20051008_183224_211_fsp.sph -20051008_190256_212_fsp.sph -20051008_211712_214_fsp.sph -20051008_213416_215_fsp.sph -20051009_180444_216_fsp.sph -20051009_190753_218_fsp.sph -20051009_220443_221_fsp.sph -20051010_180650_222_fsp.sph -20051010_182706_223_fsp.sph -20051010_210622_224_fsp.sph -20051010_222853_227_fsp.sph -20051010_231630_228_fsp.sph -20051011_181919_230_fsp.sph -20051011_211026_232_fsp.sph -20051011_220348_233_fsp.sph -20051012_180233_234_fsp.sph -20051012_190241_236_fsp.sph 
-20051012_193952_237_fsp.sph -20051012_224157_239_fsp.sph -20051013_180458_240_fsp.sph -20051013_180613_241_fsp.sph -20051013_180700_242_fsp.sph -20051013_182213_244_fsp.sph -20051013_210221_245_fsp.sph -20051013_210425_246_fsp.sph -20051013_210941_247_fsp.sph -20051013_220243_248_fsp.sph -20051014_180259_249_fsp.sph -20051014_180940_250_fsp.sph -20051014_180948_251_fsp.sph -20051014_183707_252_fsp.sph -20051014_210348_253_fsp.sph -20051014_210647_254_fsp.sph -20051014_220227_256_fsp.sph -20051014_230339_257_fsp.sph -20051015_180549_258_fsp.sph -20051015_190247_259_fsp.sph -20051015_210138_260_fsp.sph -20051015_210701_261_fsp.sph -20051015_210831_262_fsp.sph -20051016_180926_266_fsp.sph -20051017_000346_269_fsp.sph -20051017_210137_273_fsp.sph -20051017_215732_274_fsp.sph -20051018_180559_277_fsp.sph -20051018_180816_278_fsp.sph -20051018_211701_282_fsp.sph -20051018_231046_283_fsp.sph -20051018_235317_284_fsp.sph -20051019_180448_285_fsp.sph -20051019_183344_287_fsp.sph -20051020_180339_293_fsp.sph -20051020_180759_295_fsp.sph -20051020_210218_297_fsp.sph -20051020_212525_299_fsp.sph -20051020_222944_300_fsp.sph -20051020_234953_301_fsp.sph -20051021_180218_302_fsp.sph -20051021_180508_303_fsp.sph -20051021_190605_304_fsp.sph -20051021_210159_305_fsp.sph -20051021_210530_306_fsp.sph -20051021_222225_307_fsp.sph -20051022_001311_309_fsp.sph -20051022_180452_310_fsp.sph -20051022_180829_312_fsp.sph -20051022_190406_313_fsp.sph -20051022_200517_314_fsp.sph -20051022_210920_315_fsp.sph -20051022_230324_316_fsp.sph -20051022_232428_317_fsp.sph -20051023_180342_318_fsp.sph -20051023_180530_319_fsp.sph -20051023_190301_321_fsp.sph -20051023_210258_322_fsp.sph -20051023_210605_323_fsp.sph -20051023_223751_324_fsp.sph -20051024_000348_326_fsp.sph -20051024_180624_328_fsp.sph -20051024_210748_330_fsp.sph -20051024_211346_331_fsp.sph -20051024_221753_332_fsp.sph -20051024_230857_333_fsp.sph -20051025_180351_334_fsp.sph -20051025_210532_335_fsp.sph -20051025_210959_336_fsp.sph -20051025_220419_338_fsp.sph -20051026_180611_340_fsp.sph -20051026_190359_343_fsp.sph -20051026_210334_344_fsp.sph -20051026_211202_345_fsp.sph -20051026_230956_347_fsp.sph -20051026_234001_348_fsp.sph -20051027_180217_349_fsp.sph -20051027_210159_351_fsp.sph -20051027_210333_352_fsp.sph -20051027_211525_353_fsp.sph -20051027_231329_354_fsp.sph -20051028_180329_355_fsp.sph -20051028_210350_358_fsp.sph -20051028_211904_359_fsp.sph -20051029_200218_363_fsp.sph -20051029_210442_364_fsp.sph -20051029_220538_366_fsp.sph -20051030_000333_367_fsp.sph -20051030_180521_368_fsp.sph -20051030_181001_369_fsp.sph -20051030_190231_370_fsp.sph -20051030_210903_372_fsp.sph -20051030_230444_373_fsp.sph -20051031_180213_374_fsp.sph -20051031_180906_375_fsp.sph -20051031_210229_377_fsp.sph -20051031_220447_379_fsp.sph -20051101_153940_380_fsp.sph -20051101_211314_384_fsp.sph -20051101_223911_387_fsp.sph -20051101_230216_388_fsp.sph -20051102_175957_390_fsp.sph -20051102_210243_394_fsp.sph -20051102_210828_395_fsp.sph -20051102_211130_396_fsp.sph -20051103_163507_398_fsp.sph -20051103_180920_400_fsp.sph -20051103_185102_401_fsp.sph -20051103_210539_403_fsp.sph -20051103_223906_405_fsp.sph -20051104_123901_407_fsp.sph -20051104_180145_408_fsp.sph -20051104_181437_409_fsp.sph -20051104_190247_410_fsp.sph -20051104_210307_411_fsp.sph -20051104_210814_412_fsp.sph -20051104_212121_413_fsp.sph -20051104_222117_414_fsp.sph -20051104_231424_416_fsp.sph -20051105_175657_418_fsp.sph -20051105_181203_419_fsp.sph -20051105_210724_421_fsp.sph 
-20051105_220745_422_fsp.sph -20051106_180232_424_fsp.sph -20051106_181321_425_fsp.sph -20051106_190219_426_fsp.sph -20051106_200213_427_fsp.sph -20051106_210215_428_fsp.sph -20051106_210310_429_fsp.sph -20051106_211252_430_fsp.sph -20051106_211804_431_fsp.sph -20051106_215339_432_fsp.sph -20051106_221653_433_fsp.sph -20051107_115855_434_fsp.sph -20051107_160351_435_fsp.sph -20051107_180332_436_fsp.sph -20051107_182401_437_fsp.sph -20051107_210309_439_fsp.sph -20051107_212723_440_fsp.sph -20051108_145902_441_fsp.sph -20051108_181424_442_fsp.sph -20051108_210224_443_fsp.sph -20051108_212018_444_fsp.sph -20051109_180413_446_fsp.sph -20051109_181432_447_fsp.sph -20051109_181906_448_fsp.sph -20051109_183631_449_fsp.sph -20051109_210436_451_fsp.sph -20051109_211151_452_fsp.sph -20051109_212148_453_fsp.sph -20051109_232505_454_fsp.sph -20051110_155523_455_fsp.sph -20051110_180208_456_fsp.sph -20051110_180838_457_fsp.sph -20051110_182221_459_fsp.sph -20051110_182318_460_fsp.sph -20051110_210200_461_fsp.sph -20051110_210233_462_fsp.sph -20051110_210454_463_fsp.sph -20051110_211110_464_fsp.sph -20051110_212818_466_fsp.sph -20051110_225245_467_fsp.sph -20051111_181441_471_fsp.sph -20051111_184451_474_fsp.sph -20051111_190326_475_fsp.sph -20051111_194004_477_fsp.sph -20051111_201357_478_fsp.sph -20051111_230329_480_fsp.sph -20051112_000305_482_fsp.sph -20051112_165916_483_fsp.sph -20051112_185651_487_fsp.sph -20051112_190443_488_fsp.sph -20051112_210205_489_fsp.sph -20051112_210631_490_fsp.sph -20051112_231502_491_fsp.sph -20051113_180809_493_fsp.sph -20051113_210908_497_fsp.sph -20051113_220433_499_fsp.sph -20051114_171942_502_fsp.sph -20051114_181118_504_fsp.sph -20051114_210412_506_fsp.sph -20051114_212032_507_fsp.sph -20051114_215057_508_fsp.sph -20051114_220412_509_fsp.sph -20051114_225557_510_fsp.sph -20051115_134012_511_fsp.sph -20051115_180301_512_fsp.sph -20051115_181412_513_fsp.sph -20051115_181731_514_fsp.sph -20051115_182149_515_fsp.sph -20051115_213551_517_fsp.sph -20051115_215935_518_fsp.sph -20051115_230749_520_fsp.sph -20051116_000221_521_fsp.sph -20051116_172353_522_fsp.sph -20051116_180237_524_fsp.sph -20051116_181228_525_fsp.sph -20051116_181816_526_fsp.sph -20051116_190450_527_fsp.sph -20051116_210146_528_fsp.sph -20051116_210553_529_fsp.sph -20051116_211222_530_fsp.sph -20051116_212312_531_fsp.sph -20051116_222454_532_fsp.sph -20051116_233038_533_fsp.sph -20051117_001013_534_fsp.sph -20051117_180234_535_fsp.sph -20051117_181844_537_fsp.sph -20051117_210156_538_fsp.sph -20051117_210403_539_fsp.sph -20051117_211540_540_fsp.sph -20051117_211833_541_fsp.sph -20051117_212855_542_fsp.sph -20051117_213407_543_fsp.sph -20051117_220412_544_fsp.sph -20051117_225943_545_fsp.sph -20051118_180619_547_fsp.sph -20051118_180739_548_fsp.sph -20051118_182114_549_fsp.sph -20051118_182652_550_fsp.sph -20051118_210212_551_fsp.sph -20051118_210455_552_fsp.sph -20051118_212058_553_fsp.sph -20051118_212829_554_fsp.sph -20051119_000355_555_fsp.sph -20051119_181105_556_fsp.sph -20051119_210802_557_fsp.sph -20051119_212315_559_fsp.sph -20051119_214926_560_fsp.sph -20051120_181008_561_fsp.sph -20051120_181339_562_fsp.sph -20051120_190412_563_fsp.sph -20051120_205645_565_fsp.sph -20051120_210347_566_fsp.sph -20051120_211526_567_fsp.sph -20051121_181138_569_fsp.sph -20051121_181357_570_fsp.sph -20051121_190155_571_fsp.sph -20051121_210922_573_fsp.sph -20051122_181114_574_fsp.sph -20051122_190326_576_fsp.sph -20051122_210253_577_fsp.sph -20051122_210703_578_fsp.sph -20051122_211805_579_fsp.sph 
-20051122_213037_580_fsp.sph -20051122_215430_581_fsp.sph -20051123_180926_582_fsp.sph -20051123_181644_583_fsp.sph -20051123_210214_584_fsp.sph -20051123_211514_585_fsp.sph -20051123_212412_586_fsp.sph -20051123_213259_587_fsp.sph -20051124_181720_588_fsp.sph -20051124_190336_589_fsp.sph -20051124_212221_591_fsp.sph -20051124_220457_592_fsp.sph -20051125_181632_593_fsp.sph -20051125_190327_594_fsp.sph -20051125_212150_595_fsp.sph -20051126_181804_597_fsp.sph -20051126_190347_598_fsp.sph -20051126_210222_599_fsp.sph -20051127_181335_601_fsp.sph -20051127_190405_602_fsp.sph -20051127_210516_603_fsp.sph -20051127_211200_604_fsp.sph -20051127_212516_605_fsp.sph -20051128_215149_608_fsp.sph -20051128_222007_609_fsp.sph -20051129_180204_610_fsp.sph -20051129_181241_612_fsp.sph -20051129_181547_613_fsp.sph -20051129_183449_614_fsp.sph -20051129_190152_615_fsp.sph -20051129_210218_616_fsp.sph -20051129_210342_617_fsp.sph -20051129_212711_618_fsp.sph -20051130_181543_619_fsp.sph -20051130_182626_620_fsp.sph -20051130_210202_622_fsp.sph -20051130_210910_623_fsp.sph -20051130_212724_626_fsp.sph -20051130_220121_627_fsp.sph -20051130_221538_628_fsp.sph -20051201_181034_630_fsp.sph -20051201_181303_631_fsp.sph -20051201_183429_632_fsp.sph -20051201_191426_633_fsp.sph -20051201_193415_634_fsp.sph -20051201_195005_635_fsp.sph -20051201_210713_636_fsp.sph -20051201_212329_637_fsp.sph -20051201_230640_638_fsp.sph -20051202_181119_639_fsp.sph -20051202_181659_640_fsp.sph -20051202_182058_641_fsp.sph -20051202_184713_642_fsp.sph -20051202_190154_643_fsp.sph -20051202_193515_644_fsp.sph -20051202_210252_645_fsp.sph -20051202_211824_646_fsp.sph -20051202_212105_647_fsp.sph -20051203_180701_649_fsp.sph -20051203_182100_650_fsp.sph -20051203_182132_651_fsp.sph -20051203_182418_652_fsp.sph -20051203_183501_653_fsp.sph -20051203_190503_654_fsp.sph -20051203_191125_655_fsp.sph -20051203_210216_656_fsp.sph -20051203_212114_658_fsp.sph -20051203_222533_661_fsp.sph -20051206_180753_662_fsp.sph -20051206_180911_663_fsp.sph -20051206_181649_664_fsp.sph -20051206_183057_665_fsp.sph -20051206_193937_667_fsp.sph -20051206_201757_668_fsp.sph -20051206_203158_669_fsp.sph -20051206_210127_670_fsp.sph -20051206_210744_671_fsp.sph -20051206_211522_672_fsp.sph -20051206_213252_673_fsp.sph -20051206_214122_674_fsp.sph -20051206_231328_675_fsp.sph -20051207_180507_676_fsp.sph -20051207_181020_677_fsp.sph -20051207_190155_678_fsp.sph -20051207_190426_679_fsp.sph -20051207_193103_681_fsp.sph -20051207_211858_683_fsp.sph -20051207_212300_684_fsp.sph -20051207_212831_685_fsp.sph -20051207_214411_686_fsp.sph -20051208_180208_687_fsp.sph -20051208_180810_688_fsp.sph -20051208_182430_689_fsp.sph -20051208_190333_690_fsp.sph -20051208_210609_691_fsp.sph -20051208_211702_692_fsp.sph -20051208_212444_694_fsp.sph -20051208_214100_696_fsp.sph -20051208_220606_697_fsp.sph -20051209_180824_699_fsp.sph -20051209_181542_700_fsp.sph -20051209_181642_701_fsp.sph -20051209_182541_702_fsp.sph -20051209_182858_703_fsp.sph -20051209_210136_704_fsp.sph -20051209_210452_705_fsp.sph -20051209_211542_706_fsp.sph -20051209_212515_707_fsp.sph -20051209_222427_709_fsp.sph -20051209_231702_710_fsp.sph -20051210_180659_711_fsp.sph -20051210_181201_712_fsp.sph -20051210_182013_713_fsp.sph -20051210_182603_714_fsp.sph -20051210_190201_715_fsp.sph -20051210_210535_717_fsp.sph -20051210_210735_718_fsp.sph -20051211_000414_719_fsp.sph -20051211_181346_720_fsp.sph -20051211_182045_721_fsp.sph -20051211_184252_723_fsp.sph -20051211_190523_724_fsp.sph 
-20051211_210240_725_fsp.sph -20051211_211415_726_fsp.sph -20051212_180251_727_fsp.sph -20051212_181817_728_fsp.sph -20051212_182453_729_fsp.sph -20051212_190335_730_fsp.sph -20051212_210527_731_fsp.sph -20051212_210738_732_fsp.sph -20051212_211419_733_fsp.sph -20051212_213447_734_fsp.sph -20051212_214512_735_fsp.sph -20051213_180254_736_fsp.sph -20051213_185913_737_fsp.sph -20051213_191741_738_fsp.sph -20051213_210120_739_fsp.sph -20051213_211552_741_fsp.sph -20051213_211953_742_fsp.sph -20051213_221424_743_fsp.sph -20051213_222016_744_fsp.sph -20051214_193942_746_fsp.sph -20051214_194606_747_fsp.sph -20051214_201000_748_fsp.sph -20051214_202717_749_fsp.sph -20051214_211653_750_fsp.sph -20051214_212318_751_fsp.sph -20051214_212718_752_fsp.sph -20051214_213225_753_fsp.sph -20051215_180855_754_fsp.sph -20051215_181731_755_fsp.sph -20051215_182213_756_fsp.sph -20051215_190143_757_fsp.sph -20051215_190419_758_fsp.sph -20051215_195526_759_fsp.sph -20051215_200925_760_fsp.sph -20051215_201639_761_fsp.sph -20051215_203848_762_fsp.sph -20051215_210410_764_fsp.sph -20051215_212456_766_fsp.sph -20051215_212701_767_fsp.sph -20051215_212749_768_fsp.sph -20051215_214814_769_fsp.sph -20051215_220537_770_fsp.sph -20051215_222306_771_fsp.sph -20051216_181042_773_fsp.sph -20051216_182340_774_fsp.sph -20051216_191101_775_fsp.sph -20051216_192823_776_fsp.sph -20051216_200153_777_fsp.sph -20051216_211423_778_fsp.sph -20051216_220626_779_fsp.sph -20051217_142547_780_fsp.sph -20051217_180231_781_fsp.sph -20051217_182026_783_fsp.sph -20051217_182330_784_fsp.sph -20051217_182530_785_fsp.sph -20051217_183115_786_fsp.sph -20051217_190226_787_fsp.sph -20051218_142845_790_fsp.sph -20051218_180353_791_fsp.sph -20051218_181751_792_fsp.sph -20051218_182127_793_fsp.sph -20051218_182750_794_fsp.sph -20051218_200401_799_fsp.sph -20051218_210249_800_fsp.sph -20051218_211820_801_fsp.sph -20051218_212444_802_fsp.sph -20051218_212813_803_fsp.sph -20051219_180225_804_fsp.sph -20051219_182110_806_fsp.sph -20051219_190625_808_fsp.sph -20051219_210655_812_fsp.sph -20051219_212218_813_fsp.sph -20051219_212716_814_fsp.sph -20051219_213203_815_fsp.sph -20051219_221213_816_fsp.sph -20051219_223123_817_fsp.sph -20051220_181731_820_fsp.sph -20051220_190121_821_fsp.sph -20051220_212400_826_fsp.sph -20051220_212718_828_fsp.sph -20051220_213420_829_fsp.sph -20051221_000417_830_fsp.sph -20051221_180958_831_fsp.sph -20051221_210452_840_fsp.sph -20051221_212325_841_fsp.sph -20051221_212911_842_fsp.sph -20051222_000436_843_fsp.sph -20051222_181242_845_fsp.sph -20051222_181506_846_fsp.sph -20051222_182617_847_fsp.sph -20051222_184209_849_fsp.sph -20051222_200553_850_fsp.sph -20051222_210309_852_fsp.sph -20051222_212425_855_fsp.sph -20051223_180346_856_fsp.sph -20051223_181050_857_fsp.sph -20051223_183105_860_fsp.sph -20051223_212547_863_fsp.sph -20051223_212853_864_fsp.sph -20051224_180302_865_fsp.sph -20051224_182949_867_fsp.sph -20051224_210150_870_fsp.sph -20051224_213010_871_fsp.sph -20051225_192042_872_fsp.sph -20051225_210556_873_fsp.sph -20051226_180908_874_fsp.sph -20051226_181659_875_fsp.sph -20051227_181058_885_fsp.sph -20051227_211308_887_fsp.sph -20051227_213029_888_fsp.sph -20051227_214843_889_fsp.sph -20051227_220309_890_fsp.sph -20051228_180249_891_fsp.sph -20051228_182051_892_fsp.sph -20051228_183955_893_fsp.sph -20051228_210524_896_fsp.sph -20051228_211808_897_fsp.sph -20051228_212304_899_fsp.sph -20051228_212734_900_fsp.sph -20051228_223227_901_fsp.sph -20051229_180231_902_fsp.sph -20051229_182614_906_fsp.sph 
-20051229_182631_907_fsp.sph -20051229_214024_909_fsp.sph -20051230_180457_910_fsp.sph -20051230_181721_912_fsp.sph -20051230_210412_913_fsp.sph -20051230_210559_914_fsp.sph -20051230_212557_915_fsp.sph -20051231_000808_916_fsp.sph -20060103_180314_917_fsp.sph -20060103_182107_918_fsp.sph -20060103_182257_919_fsp.sph -20060103_182549_920_fsp.sph -20060103_182654_921_fsp.sph -20060103_184037_922_fsp.sph -20060103_211504_925_fsp.sph -20060103_211732_926_fsp.sph -20060104_180509_928_fsp.sph -20060104_181040_929_fsp.sph -20060104_182115_930_fsp.sph -20060104_182644_931_fsp.sph -20060104_190448_933_fsp.sph -20060104_192707_934_fsp.sph -20060104_210223_935_fsp.sph -20060104_212844_936_fsp.sph -20060104_220148_937_fsp.sph -20060105_202127_943_fsp.sph -20060105_205957_944_fsp.sph -20060105_210951_945_fsp.sph -20060105_211743_946_fsp.sph -20060105_213129_947_fsp.sph -20060105_213243_948_fsp.sph -20060105_230711_949_fsp.sph -20060106_180202_950_fsp.sph -20060106_181040_951_fsp.sph -20060106_181726_952_fsp.sph -20060106_182909_953_fsp.sph -20060106_183056_954_fsp.sph -20060106_183550_955_fsp.sph -20060106_185224_956_fsp.sph -20060106_193129_957_fsp.sph -20060107_180634_960_fsp.sph -20060107_181553_961_fsp.sph -20060107_182715_962_fsp.sph -20060107_190206_963_fsp.sph -20060107_190415_964_fsp.sph -20060107_210435_966_fsp.sph -20060107_220739_967_fsp.sph -20060108_180630_968_fsp.sph -20060108_194731_971_fsp.sph -20060108_234917_976_fsp.sph -20060109_180448_977_fsp.sph -20060109_182557_979_fsp.sph -20060109_183636_980_fsp.sph -20060109_183727_981_fsp.sph -20060109_205815_982_fsp.sph -20060109_213409_986_fsp.sph -20060109_215138_987_fsp.sph -20060109_220315_988_fsp.sph -20060109_220535_989_fsp.sph -20060110_183405_995_fsp.sph -20060110_200611_998_fsp.sph -20060110_210730_1002_fsp.sph -20060110_213516_1004_fsp.sph -20060110_221920_1006_fsp.sph -20060110_230947_1007_fsp.sph -20060111_181650_1008_fsp.sph -20060111_182557_1009_fsp.sph -20060111_184916_1010_fsp.sph -20060111_192159_1012_fsp.sph -20060111_200345_1013_fsp.sph -20060111_210257_1014_fsp.sph -20060111_212145_1016_fsp.sph -20060111_213742_1017_fsp.sph -20060111_213936_1018_fsp.sph -20060111_230912_1020_fsp.sph -20060112_180639_1021_fsp.sph -20060112_182612_1022_fsp.sph -20060112_183346_1023_fsp.sph -20060112_183622_1024_fsp.sph -20060112_210747_1025_fsp.sph -20060112_211025_1026_fsp.sph -20060112_221010_1027_fsp.sph -20060112_221022_1028_fsp.sph -20060113_180159_1030_fsp.sph -20060113_183452_1033_fsp.sph -20060113_190403_1034_fsp.sph -20060113_213733_1036_fsp.sph -20060114_181137_1039_fsp.sph -20060114_181922_1040_fsp.sph -20060114_191056_1043_fsp.sph -20060114_213242_1044_fsp.sph -20060115_180421_1045_fsp.sph -20060115_183525_1047_fsp.sph -20060115_210217_1048_fsp.sph -20060115_212231_1051_fsp.sph -20060115_220504_1052_fsp.sph -20060115_232345_1053_fsp.sph -20060116_181908_1054_fsp.sph -20060116_182500_1055_fsp.sph -20060116_183201_1056_fsp.sph -20060116_184141_1057_fsp.sph -20060116_202324_1058_fsp.sph -20060116_204753_1059_fsp.sph -20060116_210217_1060_fsp.sph -20060116_211237_1061_fsp.sph -20060116_212845_1063_fsp.sph -20060116_220652_1064_fsp.sph -20060116_221118_1065_fsp.sph -20060117_181936_1068_fsp.sph -20060117_182604_1069_fsp.sph -20060117_185153_1071_fsp.sph -20060117_210138_1072_fsp.sph -20060117_210311_1073_fsp.sph -20060117_212546_1074_fsp.sph -20060118_180229_1076_fsp.sph -20060118_180647_1078_fsp.sph -20060118_182448_1079_fsp.sph -20060118_183010_1080_fsp.sph -20060118_190231_1082_fsp.sph -20060118_200148_1083_fsp.sph 
-20060118_205216_1084_fsp.sph -20060118_212907_1085_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test deleted file mode 100644 index 0cbc3cc95fd..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test +++ /dev/null @@ -1,20 +0,0 @@ -sp_0053.sph -sp_0082.sph -sp_0084.sph -sp_0088.sph -sp_0681.sph -sp_0699.sph -sp_0776.sph -sp_0857.sph -sp_1031.sph -sp_1100.sph -sp_1148.sph -sp_1156.sph -sp_1186.sph -sp_1212.sph -sp_1345.sph -sp_1435.sph -sp_1578.sph -sp_1648.sph -sp_1807.sph -sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train deleted file mode 100644 index 2c936072534..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train +++ /dev/null @@ -1,80 +0,0 @@ -sp_0085.sph -sp_0096.sph -sp_0098.sph -sp_0100.sph -sp_0291.sph -sp_0713.sph -sp_0724.sph -sp_0726.sph -sp_0731.sph -sp_0733.sph -sp_0753.sph -sp_0788.sph -sp_0826.sph -sp_0831.sph -sp_0836.sph -sp_0841.sph -sp_0850.sph -sp_0855.sph -sp_0892.sph -sp_0899.sph -sp_0910.sph -sp_0917.sph -sp_0919.sph -sp_0923.sph -sp_0945.sph -sp_0950.sph -sp_0951.sph -sp_0992.sph -sp_0997.sph -sp_1013.sph -sp_1039.sph -sp_1044.sph -sp_1045.sph -sp_1058.sph -sp_1060.sph -sp_1063.sph -sp_1081.sph -sp_1106.sph -sp_1122.sph -sp_1140.sph -sp_1175.sph -sp_1195.sph -sp_1198.sph -sp_1231.sph -sp_1234.sph -sp_1255.sph -sp_1260.sph -sp_1261.sph -sp_1262.sph -sp_1264.sph -sp_1266.sph -sp_1273.sph -sp_1275.sph -sp_1284.sph -sp_1286.sph -sp_1304.sph -sp_1308.sph -sp_1333.sph -sp_1341.sph -sp_1353.sph -sp_1368.sph -sp_1379.sph -sp_1384.sph -sp_1449.sph -sp_1463.sph -sp_1574.sph -sp_1740.sph -sp_1759.sph -sp_1849.sph -sp_1908.sph -sp_1915.sph -sp_1918.sph -sp_1974.sph -sp_1976.sph -sp_1988.sph -sp_2000.sph -sp_2056.sph -sp_2070.sph -sp_2091.sph -sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl deleted file mode 100755 index 03193384670..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env perl - -# Oct 21, 2015 : Gaurav Kumar (Johns Hopkins University) -# GNU General Public License, v3.0 -# -# This script was modified under GPL and is being distributed with -# Kaldi. It requires the preference and rule files -# (under LDC copyright) from LDC96L16. The main changes were -# - Outdated usage of perl conventions updated @_ => $_ or @A -# - This script no longer needs the preference and rule files to -# be in the same directory as this script. -# - Accepts tokens from instead of <> - -# --- Retained previous version information ---------------------------- -# spron.pl Version 0.1 Jan. 11 1995 -# Written by Zhibiao Wu, LDC, wzb@unagi.cis.upenn.edu -# This program needs the basic_rules file to run. The rules must be sorted -# in alphabetical order. The most specific rules should precede the more -# general ones. The conventions used in the basic rules are the same as -# regular expressions used in Perl. - -# Revised history: Feb. 10 1995 - -# The file "preferences" (assumed to be in your current directory) -# gives an "oracle" of correct pronunciations that override the -# machine-generated ones. - -# slightly changed 97/09/05 robertm: -# - look for basic_rules and preferences in $PWD instead of ~wzb/... 
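# A minimal sketch of the rule syntax this script parses (annotation, not part
# of the original header; the example rule is hypothetical, not an entry from
# the LDC96L16 rule file). Each non-comment, non-blank line of the rules file
# has the form
#
#   HEAD -> REPLACEMENT : LEFT_CONTEXT __ RIGHT_CONTEXT
#
# where the contexts are Perl regular expressions and may be empty. For
# instance, a hypothetical rule
#
#   c -> s : __ [ei]
#
# would rewrite "c" as "s" whenever the following character is "e" or "i".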
-# - use next to shortcut loop instead of if/else -# - added a bit of documentation, without really trying to decipher this thing -# ----------------------------------------------------------------------- - -use utf8; -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); - -$vfile = ""; -$preference_file = ""; -$rules_file = ""; -$print_input = 0; -if ($#ARGV < 1) { - # Print Usage - print "Usage : local/spron.pl pref-file rules-file \n"; - exit 1; -} else { - $preference_file = $ARGV[0]; - $rules_file = $ARGV[1]; - if ($#ARGV > 1) { - $vfile = $ARGV[2]; - } - if ($#ARGV > 2) { - $print_input = 1; - } -} - -$rule_num = 0; -$previous = ""; -if ($vfile ne "") { - open(VF, $vfile) || die "Can't find file $vfile!\n"; - while () { - chop; - @A = split(//); - if (($A[0] ne '#') && ($_ ne "")) { - if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { - $head[$rule_num] = $1; - $end[$rule_num] = $2; - $pre[$rule_num] = $3; - if ($4 =~ /#/) { - $nex[$rule_num] = ""; - $some[$rule_num] = $4; - } else { - $nex[$rule_num] = $4; - $some[$rule_num] = $5; - } - if ($previous ne substr($head[$rule_num],0,1)) { - $first{$head[$rule_num]} = $rule_num; - $last{$previous} = $rule_num - 1; - } - $previous = substr($head[$rule_num++],0,1); - } else { - print "Rule format error: Cannot parse $_\n"; - exit(1); - } - } - } - $last{$previous} = $rule_num - 1; - - close(VF); -} - -open(PF, $preference_file) || die "Can't read `preferences' file"; -binmode(PF, ":iso88591"); -while () { - chop; - if ($_ ne "") { - @A = split; - $pron{$A[0]} = $A[1]; - $stre{$A[0]} = $A[2]; - } -} - -$previous = ""; -$brule_num = 0; -open(BF, $rules_file) || die "Can't read `basic_rules' file"; -binmode(BF, ":iso88591"); -while () { - chop; - @A = split(//); - if (($A[0] ne '#') && ($_ ne "")) { - if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { - $bhead[$brule_num] = $1; - $bend[$brule_num] = $2; - $bpre[$brule_num] = $3; - if ($4 =~ /#/) { - $bnex[$brule_num] = ""; - $bsome[$brule_num] = $4; - } else { - $bnex[$brule_num] = $4; - $bsome[$brule_num] = $5; - } - if ($previous ne substr($bhead[$brule_num],0,1)) { - $bfirst{substr($bhead[$brule_num],0,1)} = $brule_num; - $blast{$previous} = $brule_num - 1; - } - $previous = substr($bhead[$brule_num++],0,1); - } else { - print "Rule format error in file basic_rules: Cannot parse $_\n"; - exit(1); - } - } -} -$blast{$previous} = $brule_num - 1; -close(BF); - -if ($brule_num == 0) { - print "No basic rules, Program exit!\n"; - exit(1); -} - -while(){ - next if ((/^#/) || (/^\s*$/) ); - chop; - if ($print_input) { - print $_, "\t"; - } - if ($pron{$_}) { - # print answer from preferences and skip to next word - print "$pron{$_}\t$stre{$_}\n"; - next; - } - $original = $_; - tr/A-ZÁÉÍÓÚÏÜÑ/a-záéíóúïüñ/; - $orig = "#" . $_ . 
"#"; - - @l = (); - - push(@l,split("",$orig)); - - @pron = &transfer(1); - - foreach (@pron) { - $a = $_; - y/aeiouáéíóú//cd; - if ($_ eq "") { - print "#No stressable vowel in $original\n"; - } else { - s/[aeiou]/0/go; - s/[áéíóú]/1/go; - if (!/1/) { - if(length() == 1){ - s/\b./1/o; - } elsif($l[$#l - 1] =~ /[aeiouns]/o){ - s/00\b/10/o; - } else { - s/0\b/1/o; - } - } - - $a =~ s/á/a/g; - $a =~ s/é/e/g; - $a =~ s/í/i/g; - $a =~ s/ó/o/g; - $a =~ s/ú/u/g; - - print "$a\t$_\n"; - } - } -} - -sub transfer{ - local($_) = @_; - local(@p) = (); - local($s) = 0; - local($over) = 0; - local($i,$j,$k) = (0,0,0); - - if ($_ >= length($orig) - 1) { - push(@p, ""); - return(@p); - } else { - - if ($vfile ne "") { - for ($i= $first{substr($orig, $_, 1)}; - $i <= $last{substr($orig, $_, 1)} ; $i++) { - if (&matchv($_,$i)) { - $s = $_ + length($head[$i]); - foreach $w (&transfer($s)) { - push(@p, $end[$i] . $w); - if ($some[$i] ne "") { - $over = 0; - } else { - $over = 1; - } - } - } - } - } - - if ($over == 0 ) { - $i = $bfirst{substr($orig, $_, 1)}; - while (($i <= $blast{substr($orig, $_, 1)}) && ($over == 0)) { - if (&matchb($_,$i)) { - $over = 1; - $s = $_ + length($bhead[$i]); - foreach $w (&transfer($s)) { - push(@p, $bend[$i] . $w); - } - } - $i++; - } - if ($over == 0) { - $s = $_ + 1; - foreach $w (&transfer($s)) { - push(@p, substr($orig,$_,1) . $w); - } - } - } - - return(@p); - } -} - -sub matchv { - $h = $head[$_[1]]; - $p = $pre[$_[1]]; - $n = $nex[$_[1]]; - - return(&match($_[0],$h,$p,$n)); - -} - -sub matchb { - $h = $bhead[$_[1]]; - $p = $bpre[$_[1]]; - $n = $bnex[$_[1]]; - - return(&match($_[0],$h,$p,$n)); - -} - -sub match { - - if (substr($orig, $_[0], length($_[1])) eq $_[1]) { - return ( &match_n($_[0] + length($_[1]) - 1, $_[3]) && - &match_p($_[0], $_[2])); - } else { - return (0); - } -} - -sub match_p { - local($a) = $_[0]; - local($b) = $_[1]; - local($_); - - if ($b eq "" ) { - return (1); - } else { - $_ = substr($orig, 0, $a) . "!"; - if (/($b)!/) { - return(1); - } else { - return(0); - } - } -} - -sub match_n { - local($a) = $_[0]; - local($b) = $_[1]; - local($_); - - if ($b eq "" ) { - return (1); - } else { - $_ = "!" . substr($orig, $a + 1, length($orig) - $a - 1); - if (/!($b)/) { - return(1); - } else { - return(0); - } - } -} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh deleted file mode 100755 index 9f5855d56c4..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -# -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is a subset of the dataset in use. (*.sph files) -# In addition the transcripts are needed as well. -# This script is only called internally and should not be -# used for any other purpose. A similar script for general usage -# is local/fsp_data_prep.sh -# To be run from one directory above this script. - -stage=0 - -export LC_ALL=C - - -if [ $# -lt 4 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories and the name of this partition -, and a list of files that belong to this partition . see ../run.sh for example." - exit 1; -fi - -subset=$3 -dir=`pwd`/data/local/$subset/data -mkdir -p $dir -local=`pwd`/local -utils=`pwd`/utils -tmpdir=`pwd`/data/local/tmp -mkdir -p $tmpdir - -. 
./path.sh || exit 1; # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command -# line arguments being absolute pathnames. -rm -r links/ 2>/dev/null -mkdir links/ -mkdir links/speech -mkdir links/transcripts -if [ ! -f $4 ]; then - echo "Please specify a valid parition file. Could not find $4" - exit 1; -fi -cat $4 | sed 's:.*/::g' | \ -xargs -I % find $1/ -name %* | xargs -I % echo cp % links/ - -# Basic spot checks to see if we got the data that we needed -if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; -then - echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti -vely" - exit 1; -fi - -if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; -then - echo "Disc 1 and 2 directories missing or not properly organised within the speech data dir" - echo "Typical format is LDC2010S01/DISC?/data/speech" - exit 1; -fi - -#Check the transcripts directories as well to see if they exist -if [ ! -d links/LDC2010T04/data/transcripts ]; -then - echo "Transcript directories missing or not properly organised" - echo "Typical format is LDC2010T04/data/transcripts" - exit 1; -fi - -speech_d1=$dir/links/LDC2010S01/DISC1/data/speech -speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts -#Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi - -if [ $stage -le 0 ]; then - #Gather all the speech files together to create a file list - #TODO: Train and test split might be required - ( - find $speech_d1 -iname '*.sph'; - find $speech_d2 -iname '*.sph'; - ) > $tmpdir/train_sph.flist - - #Get all the transcripts in one place - find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist -fi - -if [ $stage -le 1 ]; then - $local/fsp_make_trans.pl $tmpdir - mkdir -p $dir/train_all - mv $tmpdir/reco2file_and_channel $dir/train_all/ -fi - -if [ $stage -le 2 ]; then - sort $tmpdir/text.1 | grep -v '((' | \ - awk '{if (NF > 1){ print; }}' | \ - sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ - sed 's:\([^<]*\)<\/lname>:\1:g' | \ - sed 's:::g' | \ - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed 's:<\s*cough[\/]*>:[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's:[^<]*<\/background>:[noise]:g' | \ - sed -r 's:<[/]?background[/]?>:[noise]:g' | \ - #One more time to take care of nested stuff - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ - #now handle the exceptions, find a cleaner way to do this? - sed 's:::g' | \ - sed 's:::g' | \ - sed 's:foreign>::g' | \ - sed 's:>::g' | \ - #How do you handle numbers? 
- grep -v '()' | \ - #Now go after the non-printable characters - sed -r 's:¿::g' > $tmpdir/text.2 - cp $tmpdir/text.2 $dir/train_all/text - - #Create segments file and utt2spk file - ! cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ - && echo "Error producing utt2spk file" && exit 1; - - cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; - $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/train_all/segments - - $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt -fi - -if [ $stage -le 3 ]; then - cat $tmpdir/train_sph.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp - cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ - sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; -fi - -if [ $stage -le 4 ]; then - # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. - cat $tmpdir/spk2gendertmp | sort | uniq > $dir/train_all/spk2gender -fi - -echo "Fisher Spanish Data preparation succeeded." - -exit 1; - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py deleted file mode 100755 index ce83fa8c8aa..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -import os -import sys - -files = [ -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/scoring/13.tra')] - -def findTranscription(timeDetail): - - for file1 in files: - file1.seek(0,0) - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - - -wordsFile = open('exp/tri5a/graph/words.txt') -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations 
added to the spanish files? -# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists('exp/tri5a/one-best/train'): - os.makedirs('exp/tri5a/one-best/train') - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - newFile.close() -provFile.close() - - - - - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py deleted file mode 100755 index b9f906b27da..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -from __future__ import print_function -import os -import sys -import subprocess - -latticeLocation = {1:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/latjosh-2/lattices-pushed/", -2:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/latjosh-2/lattices-pushed/", -3:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/latjosh-2/lattices-pushed/", -4:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/latjosh-2/lattices-pushed/", -5:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/latjosh-2/lattices-pushed/", -6:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/latjosh-2/lattices-pushed/", -7:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/latjosh-2/lattices-pushed/", -8:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/latjosh-2/lattices-pushed/", -9:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/latjosh-2/lattices-pushed/", -10:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/latjosh-2/lattices-pushed/"} - -latticeDict = {} - -for key,location in latticeLocation.items(): - for root, dirs, filenames in os.walk(location): - for f in filenames: - latticeDict[f] = str(key) - -tmpdir = 'data/local/data/tmp/lattmp' -if not os.path.exists(tmpdir): - os.makedirs(tmpdir) -invalidplfdir = 'data/local/data/tmp/invalidplf' -if not os.path.exists(invalidplfdir): - os.makedirs(invalidplfdir) -else: - os.system("rm " + invalidplfdir + "/*") - -def latticeConcatenate(lat1, lat2): - ''' - Concatenates lattices, writes temporary results to tmpdir - ''' - if lat1 == "": - if os.path.exists('rm ' + tmpdir + '/tmp.lat'): - os.system('rm ' + tmpdir + '/tmp.lat') - return lat2 - else: - proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) - proc.wait() - return tmpdir + '/tmp.lat' - - -def findLattice(timeDetail): - ''' - Finds the lattice corresponding to a time segment - ''' - searchKey = timeDetail + '.lat' - if searchKey in latticeDict: - return "/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-" + latticeDict[searchKey] + 
"/latjosh-2/lattices-pushed/" + searchKey - else: - return -1 - - -# Now read list of files in conversations -fileList = [] -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? -# Now get timing information to concatenate the ASR outputs - -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.plf', 'w+') -lineNo = 1 -invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/invalidPLF', 'w+') -blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/blankPLF', 'w+') -rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/removeLines', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - for line in timingFile: - timeInfo = line.split() - - # For utterances that are concatenated in the translation file, - # the corresponding FSTs have to be translated as well - mergedTranslation = "" - for timeDetail in timeInfo: - tmp = findLattice(timeDetail) - if tmp != -1: - # Concatenate lattices - mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - - if mergedTranslation != "": - - # Sanjeev's Recipe : Remove epsilons and topo sort - finalFST = tmpdir + "/final.fst" - os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) - - # Now convert to PLF - proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh /export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt ' + finalFST, stdout=subprocess.PIPE, shell=True) - PLFline = proc.stdout.readline() - finalPLFFile = tmpdir + "/final.plf" - finalPLF = open(finalPLFFile, "w+") - finalPLF.write(PLFline) - finalPLF.close() - - # now check if this is a valid PLF, if not write it's ID in a - # file so it can be checked later - proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) - line = proc.stdout.readline() - print("{} {}".format(line, lineNo)) - if line.strip() != "PLF format appears to be correct.": - os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) - invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - else: - provFile.write(PLFline) - else: - blankPLF.write(timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - # Now convert to PLF - lineNo += 1 - -provFile.close() -invalidPLF.close() -blankPLF.close() -rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh deleted file mode 100755 index b8b3ca35ef9..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -stage=-2 -num_words_pocolm=110000 -prune_size=1000000 - -. ./path.sh -. ./cmd.sh -. ./utils/parse_options.sh - -set -euo pipefail - -export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) -export PATH=$PATH:$POCOLM_ROOT/scripts - -textdir=$1 -pocolm_dir=$2 - - -if [ $stage -le -2 ]; then - echo "****" - echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" - echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." 
- echo "****" - if [ -e "$textdir"/unigram_weights ]; then - rm "$textdir"/unigram_weights - fi - if [ -e "$pocolm_dir" ]; then - rm -r "$pocolm_dir" - fi - - bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ - --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - -fi - -if [ $stage -le -1 ];then - echo "********" - echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." - echo "********" - - echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done - python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights - bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ - --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ - "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" - mkdir -p "$pocolm_dir"/arpa - format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ - gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz -fi - - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py deleted file mode 100755 index 3f6444da294..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -import os -import sys - -files = [ -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/oracle/oracle.tra')] - -def findTranscription(timeDetail): - - for file1 in files: - file1.seek(0,0) - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - - -wordsFile = open('exp/tri5a/graph/words.txt') -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
-# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists('exp/tri5a/one-best/train'): - os.makedirs('exp/tri5a/one-best/train') - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.oracle', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - newFile.close() -provFile.close() - - - - - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter b/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter deleted file mode 100755 index 4fce42945b3..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sed -f -s:\[laughter\]::g -s:\[noise\]::g -s:\[oov\]::g -s:::g diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh deleted file mode 100755 index 2993311fd90..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -export KALDI_ROOT=`pwd`/../../../ -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH -[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 -. $KALDI_ROOT/tools/config/common_path.sh -export LD_LIBRARY_PATH=/home/dpovey/libs - -export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk -export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C -export LANG=C - -source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/rnnlm b/egs/fisher_callhome_spanish/s5_gigaword/rnnlm deleted file mode 120000 index fb754622d5e..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/rnnlm +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh deleted file mode 100755 index 95425c29034..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ /dev/null @@ -1,310 +0,0 @@ -#!/bin/bash -# -# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Recipe for Fisher/Callhome-Spanish - -stage=-1 -lmstage=-2 -train_rnnlm=false -start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. - # If you already have the normalised gigword text somewhere, you can bypass the - # time consuming text cleanup (~1 week) by setting this option false. -addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to - # perform the A, A + G, Dev type POCOLM training configuration. 
- # A=fsp train, G=gigword text, -num_words_pocolm=110000 -train_sgmm2=false - -# call the next line with the directory where the Spanish Fisher data is -# (the values below are just an example). -sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/c03/svalluri//LDC2010T04 -spanish_lexicon=/export/corpora/LDC/LDC96L16 -split=local/splits/split_fisher - -callhome_speech=/export/corpora/LDC/LDC96S35 -callhome_transcripts=/export/corpora/LDC/LDC96T17 -split_callhome=local/splits/split_callhome - -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=workdir_rnnlm_Spanish_08032019 -mfccdir=`pwd`/mfcc - -. ./cmd.sh -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -set -eou pipefail - -if [ $stage -le -1 ]; then - local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts - - local/callhome_data_prep.sh $callhome_speech $callhome_transcripts - - # The lexicon is created using the LDC spanish lexicon, the words from the - # fisher spanish corpus. Additional (most frequent) words are added from the - # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted - # wordlist is downloaded if it is not available. - local/fsp_prepare_dict.sh $spanish_lexicon - # Let's keep the original dict copy for G2P training - cp -r data/local/dict data/local/dict_orig - ( - steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error - ) & - - # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig - - utils/fix_data_dir.sh data/local/data/train_all - - steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1; - - utils/fix_data_dir.sh data/local/data/train_all - utils/validate_data_dir.sh data/local/data/train_all - - cp -r data/local/data/train_all data/train_all - - # For the CALLHOME corpus - utils/fix_data_dir.sh data/local/data/callhome_train_all - - steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1; - - utils/fix_data_dir.sh data/local/data/callhome_train_all - utils/validate_data_dir.sh data/local/data/callhome_train_all - - cp -r data/local/data/callhome_train_all data/callhome_train_all - - local/create_splits.sh $split - local/callhome_create_splits.sh $split_callhome - -fi - -if $start_textcleanup; then - echo "WARNING : Starting from cleaning up and normalizing the Gigword text" - echo " This might take few days........... You can opt out this stage " - echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." - - if [ $stage -le 0 ]; then - mkdir -p "$rnnlm_workdir"/gigaword_rawtext - local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 - cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt - local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ - "$rnnlm_workdir"/normalised_gigaword_corpus/ - mkdir -p "$rnnlm_workdir"/text_lm - cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt - cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. 
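    # Annotation (not part of the recipe): per the commands above and just
    # below, "$rnnlm_workdir"/text_lm ends up holding three plain-text files:
    #   train.txt                        - Fisher train transcripts, utterance ids stripped
    #   dev.txt                          - dev2 transcripts, the held-out set for LM tuning
    #   spanish_gigaword_normalised.txt  - normalised Gigaword text (train.txt appended when addtraintext=true)
    # Stage 1 points local/train_pocolm.sh and local/rnnlm.sh at this directory.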
- cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt - if $addtraintext; then - cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt - fi - fi -fi - -if [ $stage -le 1 ]; then - local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ - "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords - if $train_rnnlm; then - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ - --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm - fi -fi - - -if [ $stage -le 2 ]; then - wait # wait till G2P training finishes - if [ -f exp/g2p/.error ]; then - rm exp/g2p/.error || true - echo "Fail to train the G2P model." && exit 1; - fi - steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt - cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. - - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh - - # Now compute CMVN stats for the train, dev and test subsets - steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir - steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir - steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir - #steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir - #steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir - - #n=$[`cat data/train_all/segments | wc -l` - 158126] - #utils/subset_data_dir.sh --last data/train_all $n data/train - steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir - - steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir - steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir - steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir - - # Again from Dan's recipe : Reduced monophone training data - # Now-- there are 1.6 million utterances, and we want to start the monophone training - # on relatively short utterances (easier to align), but not only the very shortest - # ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random - # utterances from those. 
- - utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort - utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k - utils/data/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup - utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k - utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k -fi - -if [ $stage -le 3 ]; then - steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang exp/mono0a - - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; - - steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; - - - (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& - - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; - - steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; - - ( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; - )& -fi - -if [ $stage -le 4 ]; then - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; - -# Train tri3a, which is LDA+MLLT, on 100k data. - steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" \ - 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; - ( - utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; - )& -fi - -if [ $stage -le 5 ]; then -# Next we'll use fMLLR and train with SAT (i.e. 
on -# fMLLR features) - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; - - steps/train_sat.sh --cmd "$train_cmd" \ - 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; - - ( - utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev -)& - - - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; - -# Reduce the number of gaussians - steps/train_sat.sh --cmd "$train_cmd" \ - 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; - - ( - utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/test exp/tri5a/decode_test - - # Decode CALLHOME - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train - ) & - - - steps/align_fmllr.sh \ - --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ - data/train data/lang exp/tri5a exp/tri5a_ali -fi - -if $train_sgmm2; then - -steps/train_ubm.sh \ - --cmd "$train_cmd" 750 \ - data/train data/lang exp/tri5a_ali exp/ubm5 - -steps/train_sgmm2.sh \ - --cmd "$train_cmd" 5000 18000 \ - data/train data/lang exp/tri5a_ali exp/ubm5/final.ubm exp/sgmm5 - -utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph - -( - steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ - --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ - exp/sgmm5/graph data/dev exp/sgmm5/decode_dev -)& - -steps/align_sgmm2.sh \ - --nj 32 --cmd "$train_cmd" --transform-dir exp/tri5a_ali \ - --use-graphs true --use-gselect true \ - data/train data/lang exp/sgmm5 exp/sgmm5_ali - -steps/make_denlats_sgmm2.sh \ - --nj 32 --sub-split 32 --num-threads 4 \ - --beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri5a_ali \ - data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats - -steps/train_mmi_sgmm2.sh \ - --cmd "$train_cmd" --drop-frames true --transform-dir exp/tri5a_ali --boost 0.1 \ - data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \ - exp/sgmm5_mmi_b0.1 - -( -utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph -steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ - --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 12"\ - exp/tri5a/graph data/dev exp/tri5a/decode_dev -utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph -steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ - --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ - exp/sgmm5/graph data/dev exp/sgmm5/decode_dev -for iter in 1 2 3 4; do - decode=exp/sgmm5_mmi_b0.1/decode_dev_it$iter - mkdir -p $decode - steps/decode_sgmm2_rescore.sh \ - --cmd "$decode_cmd" --iter $iter --transform-dir 
exp/tri5a/decode_dev \ - data/lang_test data/dev/ exp/sgmm5/decode_dev $decode -done -) & -fi - -wait; - -if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; -fi -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/steps b/egs/fisher_callhome_spanish/s5_gigaword/steps deleted file mode 120000 index 1b186770dd1..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/steps +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/utils b/egs/fisher_callhome_spanish/s5_gigaword/utils deleted file mode 120000 index a3279dc8679..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/utils +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/utils/ \ No newline at end of file From f810119b7a0f93f9aa3b3d2d387cd113248fafa1 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 2 Apr 2019 10:48:01 -0400 Subject: [PATCH 34/49] Small cleanup for scripts format --- egs/fisher_callhome_spanish/s5/cmd.sh | 4 ++-- egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh | 6 +++--- egs/fisher_callhome_spanish/s5/steps | 2 +- egs/fisher_callhome_spanish/s5/utils | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index db97f1fbc6f..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="retry.pl queue.pl --mem 8G" -export decode_cmd="retry.pl queue.pl --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 2f478419a18..9e9e6efe7df 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -30,7 +30,7 @@ reporting_email= gigaword_workdir= # LSTM/chain options -train_stage=-20 +train_stage=-10 xent_regularize=0.1 dropout_schedule='0,0@0.20,0.3@0.50,0' @@ -157,7 +157,7 @@ if [ $stage -le 19 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" @@ -202,7 +202,7 @@ fi if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi diff --git a/egs/fisher_callhome_spanish/s5/steps b/egs/fisher_callhome_spanish/s5/steps index 1b186770dd1..6e99bf5b5ad 120000 --- a/egs/fisher_callhome_spanish/s5/steps +++ b/egs/fisher_callhome_spanish/s5/steps @@ -1 +1 @@ -../../wsj/s5/steps/ \ No newline at end of file +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/utils b/egs/fisher_callhome_spanish/s5/utils index a3279dc8679..b240885218f 120000 --- a/egs/fisher_callhome_spanish/s5/utils +++ b/egs/fisher_callhome_spanish/s5/utils @@ -1 +1 @@ -../../wsj/s5/utils/ \ No newline at end of file +../../wsj/s5/utils \ No newline at end of file From dc8a56e5bacbfbbee7573f00bbceed78398858c4 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Fri, 5 Apr 2019 06:57:03 -0400 Subject: [PATCH 35/49] Cosmetic fix --- egs/fisher_callhome_spanish/s5/run.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 95425c29034..17ef6313e5e 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -80,17 +80,18 @@ if [ $stage -le -1 ]; then fi -if $start_textcleanup; then - echo "WARNING : Starting from cleaning up and normalizing the Gigword text" - echo " This might take few days........... You can opt out this stage " - echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." - - if [ $stage -le 0 ]; then +if [ $stage -le 0 ]; then + if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can skip out this stage " + echo " by setting start_textcleanup=false, and having normalised_gigaword_corpus/text_normalized ready inside $rnnlm_workdir." + mkdir -p "$rnnlm_workdir"/gigaword_rawtext local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ "$rnnlm_workdir"/normalised_gigaword_corpus/ + fi mkdir -p "$rnnlm_workdir"/text_lm cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. 
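Taken together with the second hunk just below, this change only re-nests the guards: the stage-0 block always prepares the LM text directory, while the week-long Gigaword flattening and normalization stays optional behind start_textcleanup. A rough sketch of the resulting stage-0 block (abridged from the two hunks; every path and variable is one already defined at the top of run.sh):

if [ $stage -le 0 ]; then
  if $start_textcleanup; then
    # Optional and slow: flatten the raw Gigaword corpus and normalize it.
    mkdir -p "$rnnlm_workdir"/gigaword_rawtext
    local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" \
      "$rnnlm_workdir"/flattened_gigaword_corpus 24
    cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt
    local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ "$rnnlm_workdir"/normalised_gigaword_corpus/
  fi
  # Always build the LM text directory from the Fisher transcripts.
  mkdir -p "$rnnlm_workdir"/text_lm
  cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt
  cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt  # dev2 serves as the LM dev set
fi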
@@ -98,7 +99,6 @@ if $start_textcleanup; then if $addtraintext; then cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi - fi fi if [ $stage -le 1 ]; then From 8b8222e58dd8b5814ec29b4550b42cf613389372 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 18 Apr 2019 11:28:58 -0400 Subject: [PATCH 36/49] Remove virtenv dependency --- egs/fisher_callhome_spanish/s5/path.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 2993311fd90..c4b93124d7c 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -9,5 +9,3 @@ export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C export LANG=C - -source ~/anaconda/bin/activate py36 From 0e7afa828153697390293a4469f78c5d0600caca Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 19 Apr 2019 14:13:27 +0530 Subject: [PATCH 37/49] Update path.sh --- egs/fisher_callhome_spanish/s5/path.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index c4b93124d7c..201edd95876 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -3,9 +3,9 @@ export KALDI_ROOT=`pwd`/../../../ export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh -export LD_LIBRARY_PATH=/home/dpovey/libs +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C -export LANG=C +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 From 56d2db9a085cbf55e1345efcf4d10f78fef20c72 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 19 Apr 2019 14:20:54 +0530 Subject: [PATCH 38/49] Update install_sparrowhawk.sh --- tools/install_sparrowhawk.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh index f9bbcb1b28e..645577d4f3b 100755 --- a/tools/install_sparrowhawk.sh +++ b/tools/install_sparrowhawk.sh @@ -4,6 +4,7 @@ export CXXFLAGS="-I`pwd`/openfst/include" stage=0 if [ $stage -le 0 ] ; then + rm -rf re2 protobuf sparrowhawk* git clone -b feature/Spanish_normalizer https://github.com/spokencloud/sparrowhawk-resources.git || exit 1; patch -p0 < sparrowhawk-resources/local/Makefile.patch || exit 1; make openfst || exit 1; @@ -57,15 +58,14 @@ if [ $stage -le 1 ]; then fi if [ $stage -le 2 ]; then - source ~/anaconda/bin/activate py27 || exit 1; cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; cd sparrowhawk/language-resources/en/textnorm/classifier || exit 1; . 
./path.sh || exit 1; - python create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far + python2 create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far thraxmakedep tokenize_and_classify.grm || exit 1; make || exit 1; cd ../verbalizer - python create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far + python2 create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far cp -r ../classifier/universal_depot.far . thraxmakedep verbalize.grm || exit 1; make || exit 1; From fb6693e795d861ff47b656806e62cb93fdb1751d Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sat, 20 Apr 2019 20:26:09 +0530 Subject: [PATCH 39/49] Set lang to ESP --- tools/install_sparrowhawk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh index 645577d4f3b..b6a7af211f5 100755 --- a/tools/install_sparrowhawk.sh +++ b/tools/install_sparrowhawk.sh @@ -59,7 +59,7 @@ fi if [ $stage -le 2 ]; then cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; - cd sparrowhawk/language-resources/en/textnorm/classifier || exit 1; + cd sparrowhawk/language-resources/esp/textnorm/classifier || exit 1; . ./path.sh || exit 1; python2 create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far thraxmakedep tokenize_and_classify.grm || exit 1; From ce0f42012583c8506d47d3b4a994a3085411ebc1 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Tue, 23 Apr 2019 13:35:33 +0530 Subject: [PATCH 40/49] Set pocolm option - --limit-unk-history=true --- egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh index 0e71be29119..0a5649c2a79 100755 --- a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh @@ -52,10 +52,10 @@ bypass_metaparam_optim_opt= #for order in 3; do #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done -limit_unk_history_opt= +#limit_unk_history_opt= # If you want to limit the left of in the history of a n-gram # un-comment the following line -#limit_unk_history_opt="--limit-unk-history=true" +limit_unk_history_opt="--limit-unk-history=true" for order in ${ngram_order}; do # decide on the vocabulary. 
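As the surrounding comment in pocolm_cust.sh notes, --limit-unk-history=true makes pocolm cut off the left history of an n-gram at the unknown word, so the many Gigaword OOVs that map to the unknown-word symbol do not leak into longer histories. A hypothetical sketch of how the variable is expected to reach pocolm, modeled on the analogous train_lm.py call in Kaldi's other pocolm wrapper scripts; the option values and ${textdir} below are placeholders, not copied from this recipe:

# Placeholder invocation -- the real call with the recipe's options lives further down in pocolm_cust.sh.
train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 \
            ${limit_unk_history_opt} ${bypass_metaparam_optim_opt} \
            ${textdir} ${order} ${lm_dir}/work ${lm_dir}/${num_word}_${order}.pocolm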
From 9487ce1fa4219273a538e3427d6fb8a0d0005c6b Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 23 Apr 2019 09:39:08 -0400 Subject: [PATCH 41/49] Removed unused code --- egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 9e9e6efe7df..b6723c8a523 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -255,12 +255,6 @@ if [ $stage -le 21 ]; then fi -# Let's train first a small RNNLM on Fisher train set -rnnlmdir=exp/rnnlm_lstm_tdnn_1b -if [ $stage -le 22 ]; then - rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; -fi - if [ $stage -le 23 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true @@ -283,8 +277,6 @@ if [ $stage -le 23 ]; then bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; fi - bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ - ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & done wait From 25609c53cb1c6871dfe16595e57f7a8a0ebd7d5b Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 23 Apr 2019 14:15:51 -0400 Subject: [PATCH 42/49] Fix in checking for empty space lines in lexicon --- egs/fisher_callhome_spanish/s5/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 17ef6313e5e..07eeddac78e 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -119,7 +119,7 @@ if [ $stage -le 2 ]; then echo "Fail to train the G2P model." && exit 1; fi steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^[[:space:]]*$/d" | sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. 
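# Why the sed change above matters: "/^$/d" deletes only completely empty lines,
# so a whitespace-only line coming out of the G2P lexicon would survive the merge,
# whereas "/^[[:space:]]*$/d" deletes those as well. A quick standalone check:
printf 'casa k a s a\n   \n\n' | sed "/^$/d" | wc -l              # -> 2: the spaces-only line is kept
printf 'casa k a s a\n   \n\n' | sed "/^[[:space:]]*$/d" | wc -l  # -> 1: only the real lexicon entry is kept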
utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang From 510db0f6c72dc6c7b223400a9e203ecdacd3d390 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 25 Apr 2019 07:39:30 -0400 Subject: [PATCH 43/49] Fix in RNNLM rescoring decode stage --- egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index b6723c8a523..3e400914521 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -274,6 +274,7 @@ if [ $stage -le 23 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done if [ $gigaword_workdir ]; then + lmtype=fsp_train bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; fi From 9894f4c7b48d34c0511fb430565436a747b99f9c Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sat, 27 Apr 2019 00:24:03 +0530 Subject: [PATCH 44/49] Update run.sh --- egs/fisher_callhome_spanish/s5/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 07eeddac78e..27a5f2aef82 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -13,7 +13,7 @@ start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to # perform the A, A + G, Dev type POCOLM training configuration. 
# A=fsp train, G=gigword text, -num_words_pocolm=110000 +num_words_pocolm=100000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is From 3bdb541f769c3432c9df6ed6007861275b1a30c8 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Mon, 20 May 2019 19:01:08 +0530 Subject: [PATCH 45/49] Update clean_txt_dir.sh --- egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh index 1880b3a90cb..5d25e3a3fd2 100755 --- a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh @@ -44,7 +44,7 @@ if [ $stage -le 0 ]; then $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ local/run_norm.sh \ sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ $outdir/data \ JOB \ $outdir/sparrowhawk/ From 6636557c72884771bfb656e302510e6ab4074c91 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sun, 9 Jun 2019 10:53:51 +0530 Subject: [PATCH 46/49] Update run.sh --- egs/fisher_callhome_spanish/s5/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 27a5f2aef82..c1d20134b50 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -27,8 +27,8 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=workdir_rnnlm_Spanish_08032019 +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data +rnnlm_workdir=workdir_rnnlm_Spanish_gigaword mfccdir=`pwd`/mfcc . ./cmd.sh From 36499a74b2da109302a74b7eb8f7fdc5aa670bda Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sun, 7 Jul 2019 19:44:08 +0530 Subject: [PATCH 47/49] Update run.sh --- egs/fisher_callhome_spanish/s5/run.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index c1d20134b50..70d4d0555a4 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -6,6 +6,9 @@ stage=-1 lmstage=-2 + +# GIGAWORD RNNLM training based options below. +# GIGAWORD RAW CORPUS DATA is assumed to be already downloaded in the gigaword_datapath. train_rnnlm=false start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. 
# If you already have the normalised gigword text somewhere, you can bypass the @@ -27,7 +30,7 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data # GIGAWORD RAW CORPUS DATA DOWNLOAD PATH rnnlm_workdir=workdir_rnnlm_Spanish_gigaword mfccdir=`pwd`/mfcc From 8da5c3e97053bd46d6855f761d5cb3d7a185106f Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sat, 13 Jul 2019 11:01:49 +0530 Subject: [PATCH 48/49] Reverse the order of Abbreviation process after punct syms --- egs/fisher_callhome_spanish/s5/local/run_norm.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/run_norm.sh b/egs/fisher_callhome_spanish/s5/local/run_norm.sh index f88fecc815c..a1a171a5ba6 100755 --- a/egs/fisher_callhome_spanish/s5/local/run_norm.sh +++ b/egs/fisher_callhome_spanish/s5/local/run_norm.sh @@ -24,13 +24,16 @@ for i in "${punctuation_symbols[@]}"; do num_syms=$((num_syms+1)) done mkdir -p $dir/normalize/$job -local/clean_abbrevs_text.py $data/$job $data/"$job"_processed -mv $data/"$job"_processed $data/$job + echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh bash $dir/normalize/$job/substitute.sh | \ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text + +local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed +mv $data/"$job"_processed $dir/normalize/$job/text + normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt exit 0; From 510b415d30fdb553ed26fe6907cc37a1118b4d01 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Wed, 21 Aug 2019 13:36:11 +0530 Subject: [PATCH 49/49] Update run_norm.sh --- egs/fisher_callhome_spanish/s5/local/run_norm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/run_norm.sh b/egs/fisher_callhome_spanish/s5/local/run_norm.sh index a1a171a5ba6..839636ea21a 100755 --- a/egs/fisher_callhome_spanish/s5/local/run_norm.sh +++ b/egs/fisher_callhome_spanish/s5/local/run_norm.sh @@ -29,10 +29,10 @@ echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh bash $dir/normalize/$job/substitute.sh | \ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ - sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text + sed "s: \s*: :g" > $dir/normalize/$job/text local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed -mv $data/"$job"_processed $dir/normalize/$job/text +tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt
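With these last two patches applied, run_norm.sh ends up substituting punctuation and fixing apostrophes first, handling abbreviations while the text still has its original case, lowercasing afterwards, and only then calling the Sparrowhawk normalizer. A condensed view of that final pipeline, assembled from the added lines above (comments added here only for orientation):

bash $dir/normalize/$job/substitute.sh | \
  sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \
  sed "s: \s*: :g" > $dir/normalize/$job/text                                  # punctuation/apostrophe/spacing cleanup, case preserved
local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed   # abbreviation handling on the still-cased text
tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text   # lowercase last
normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt         # Sparrowhawk normalization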