From cbc8eeb3a4e75fa4d3679ece969a9a65c99bb8ad Mon Sep 17 00:00:00 2001 From: saikiran valluri Date: Tue, 19 Feb 2019 00:02:18 -0500 Subject: [PATCH 01/49] Spanish Gigaword LM recipe --- .../s5_gigaword/cmd.sh | 15 + .../s5_gigaword/conf/decode.config | 6 + .../s5_gigaword/conf/mfcc.conf | 2 + .../s5_gigaword/conf/mfcc_hires.conf | 10 + .../s5_gigaword/conf/online_cmvn.conf | 1 + .../s5_gigaword/conf/plp.conf | 2 + .../local/callhome_create_splits.sh | 31 + .../s5_gigaword/local/callhome_data_prep.sh | 163 ++++ .../s5_gigaword/local/callhome_get_1_best.py | 75 ++ .../local/callhome_get_lattices.py | 115 +++ .../local/callhome_make_spk2gender.sh | 29 + .../s5_gigaword/local/callhome_make_trans.pl | 74 ++ .../s5_gigaword/local/callhome_text_pp.sh | 9 + .../s5_gigaword/local/chain/run_tdnn_1g.sh | 288 +++++++ .../s5_gigaword/local/clean_txt_dir.sh | 51 ++ .../s5_gigaword/local/create_oracle_ctm.sh | 30 + .../s5_gigaword/local/create_splits.sh | 30 + .../s5_gigaword/local/ctm.sh | 34 + .../s5_gigaword/local/decode_report.py | 148 ++++ .../s5_gigaword/local/find_unique_phones.pl | 25 + .../s5_gigaword/local/fix_stm.sh | 10 + .../flatten_gigaword/flatten_all_gigaword.sh | 15 + .../flatten_gigaword/flatten_one_gigaword.py | 61 ++ .../local/flatten_gigaword/run_flat.sh | 17 + .../s5_gigaword/local/fsp_create_test_lang.sh | 49 ++ .../s5_gigaword/local/fsp_data_prep.sh | 175 ++++ .../local/fsp_ideal_data_partitions.pl | 85 ++ .../s5_gigaword/local/fsp_make_spk2gender.sh | 29 + .../s5_gigaword/local/fsp_make_trans.pl | 81 ++ .../s5_gigaword/local/fsp_prepare_dict.sh | 142 ++++ .../s5_gigaword/local/fsp_train_lms.sh | 140 ++++ .../s5_gigaword/local/get_1_best.py | 62 ++ .../s5_gigaword/local/get_data_weights.pl | 39 + .../s5_gigaword/local/get_lattices.py | 115 +++ .../s5_gigaword/local/get_oracle.sh | 32 + .../s5_gigaword/local/isolate_phones.pl | 66 ++ .../s5_gigaword/local/latconvert.sh | 124 +++ .../s5_gigaword/local/merge_lexicons.py | 65 ++ .../s5_gigaword/local/monitor_denlats.sh | 31 + .../local/nnet3/run_ivector_common.sh | 187 +++++ .../s5_gigaword/local/pocolm_cust.sh | 117 +++ .../s5_gigaword/local/process_oracle.py | 64 ++ .../s5_gigaword/local/rescore.sh | 24 + .../s5_gigaword/local/rnnlm.sh | 84 ++ .../s5_gigaword/local/run_norm.sh | 33 + .../s5_gigaword/local/run_sgmm2x.sh | 57 ++ .../s5_gigaword/local/score.sh | 1 + .../s5_gigaword/local/score_oracle.sh | 29 + .../s5_gigaword/local/splits/dev | 20 + .../local/splits/split_callhome/dev | 20 + .../local/splits/split_callhome/test | 20 + .../local/splits/split_callhome/train | 80 ++ .../s5_gigaword/local/splits/split_fisher/dev | 20 + .../local/splits/split_fisher/dev2 | 20 + .../local/splits/split_fisher/test | 20 + .../local/splits/split_fisher/train | 759 ++++++++++++++++++ .../s5_gigaword/local/splits/test | 20 + .../s5_gigaword/local/splits/train | 80 ++ .../s5_gigaword/local/spron.pl | 304 +++++++ .../s5_gigaword/local/subset_data_prep.sh | 164 ++++ .../s5_gigaword/local/train_get_1_best.py | 79 ++ .../s5_gigaword/local/train_get_lattices.py | 125 +++ .../s5_gigaword/local/train_pocolm.sh | 39 + .../s5_gigaword/local/train_process_oracle.py | 79 ++ .../s5_gigaword/local/wer_output_filter | 5 + .../s5_gigaword/path.sh | 13 + .../s5_gigaword/path_venv.sh | 13 + egs/fisher_callhome_spanish/s5_gigaword/rnnlm | 1 + .../s5_gigaword/run.sh | 299 +++++++ egs/fisher_callhome_spanish/s5_gigaword/steps | 1 + egs/fisher_callhome_spanish/s5_gigaword/utils | 1 + 71 files changed, 5254 insertions(+) create mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/cmd.sh create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_prepare_dict.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py create mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/local/score.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/test create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/train create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path.sh create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/rnnlm create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/run.sh create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/steps create mode 120000 egs/fisher_callhome_spanish/s5_gigaword/utils diff --git a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh new file mode 100755 index 00000000000..0511bd2bbb0 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
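+#
+# If you have no grid and want to run everything locally, a minimal alternative
+# (illustrative only; the settings below are what this recipe actually uses) is:
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl
+#   export mkgraph_cmd=run.pl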
+ +export train_cmd="retry.pl queue.pl" +export decode_cmd="retry.pl queue.pl --mem 8G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config b/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config new file mode 100644 index 00000000000..7908f178373 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config @@ -0,0 +1,6 @@ +# Use wider-than-normal decoding beams. +first_beam=16.0 +beam=20.0 +lat_beam=10.0 +min_lmwt=2 +max_lmwt=10 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf new file mode 100644 index 00000000000..ffb41a1aae4 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=8000 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf new file mode 100644 index 00000000000..d870ab04c38 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf new file mode 100644 index 00000000000..c4b73674cab --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf @@ -0,0 +1,2 @@ +# No non-default options for now. + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh new file mode 100755 index 00000000000..07814da46a9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +data_dir=data +train_all=data/callhome_train_all + +if [ $# -lt 1 ]; then + echo "Specify the location of the split files" + exit 1; +fi + +splitFile=$1 + +# Train first +for split in train dev test +do + dirName=callhome_$split + + cp -r $train_all $data_dir/$dirName + + awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ + $splitFile/$split $train_all/segments > $data_dir/$dirName/segments + + n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$dirName/segments | sort | uniq | wc -l` + + echo "$n conversations left in split $dirName" + + utils/fix_data_dir.sh $data_dir/$dirName + utils/validate_data_dir.sh $data_dir/$dirName +done + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh new file mode 100755 index 00000000000..f61b0fa9519 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh @@ -0,0 +1,163 @@ +#!/bin/bash +# +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# The input is the Callhome Spanish Dataset. (*.sph files) +# In addition the transcripts are needed as well. +# To be run from one directory above this script. + +# Note: when creating your own data preparation scripts, it's a good idea +# to make sure that the speaker id (if present) is a prefix of the utterance +# id, that the output scp file is sorted on utterance id, and that the +# transcription file is exactly the same length as the scp file and is also +# sorted on utterance id (missing transcriptions should be removed from the +# scp file using e.g. scripts/filter_scp.pl) + +stage=0 + +export LC_ALL=C + + +if [ $# -lt 2 ]; then + echo "Arguments should be the location of the Callhome Spanish Speech and Transcript Directories, se +e ../run.sh for example." + exit 1; +fi + +cdir=`pwd` +dir=`pwd`/data/local/data +local=`pwd`/local +utils=`pwd`/utils +tmpdir=`pwd`/data/local/tmp + +. ./path.sh || exit 1; # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi +cd $dir + +# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command +# line arguments being absolute pathnames. +#rm -r links/ 2>/dev/null +mkdir -p links/ +ln -s $* links + +# Basic spot checks to see if we got the data that we needed +if [ ! -d links/LDC96S35 -o ! -d links/LDC96T17 ]; +then + echo "The speech and the data directories need to be named LDC96S35 and LDC96T17 respecti +vely" + exit 1; +fi + +if [ ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN ]; +then + echo "Dev, Eval or Train directories missing or not properly organised within the speech data dir" + exit 1; +fi + +#Check the transcripts directories as well to see if they exist +if [ ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest -o ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest -o ! 
-d links/LDC96T17/callhome_spanish_trans_970711/transcrp/train ] +then + echo "Transcript directories missing or not properly organised" + exit 1; +fi + +speech_train=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN +speech_dev=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST +speech_test=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST +transcripts_train=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/train +transcripts_dev=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest +transcripts_test=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest + +fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` +fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` +fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` +fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` +fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` +fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` + +#Now check if we got all the files that we needed +if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; +then + echo "Incorrect number of files in the data directories" + echo "The paritions should contain 80/20/20 files" + exit 1; +fi + +if [ $stage -le 0 ]; then + #Gather all the speech files together to create a file list + ( + find $speech_train -iname '*.sph'; + find $speech_dev -iname '*.sph'; + find $speech_test -iname '*.sph'; + ) > $tmpdir/callhome_train_sph.flist + + #Get all the transcripts in one place + + ( + find $transcripts_train -iname '*.txt'; + find $transcripts_dev -iname '*.txt'; + find $transcripts_test -iname '*.txt'; + ) > $tmpdir/callhome_train_transcripts.flist + +fi + +if [ $stage -le 1 ]; then + $local/callhome_make_trans.pl $tmpdir + mkdir -p $dir/callhome_train_all + mv $tmpdir/callhome_reco2file_and_channel $dir/callhome_train_all/ +fi + +if [ $stage -le 2 ]; then + sort $tmpdir/callhome.text.1 | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $dir/callhome_train_all/callhome.text + + #Create segments file and utt2spk file + ! cat $dir/callhome_train_all/callhome.text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/callhome_train_all/callhome_utt2spk \ + && echo "Error producing utt2spk file" && exit 1; + + cat $dir/callhome_train_all/callhome.text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; + $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/callhome_train_all/callhome_segments + + $utils/utt2spk_to_spk2utt.pl <$dir/callhome_train_all/callhome_utt2spk > $dir/callhome_train_all/callhome_spk2utt +fi + +if [ $stage -le 3 ]; then + for f in `cat $tmpdir/callhome_train_sph.flist`; do + # convert to absolute path + make_absolute.sh $f + done > $tmpdir/callhome_train_sph_abs.flist + + cat $tmpdir/callhome_train_sph_abs.flist | perl -ane 'm:/([^/]+)\.SPH$: || die "bad line $_; "; print lc($1)," $_"; ' > $tmpdir/callhome_sph.scp + cat $tmpdir/callhome_sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ + sort -k1,1 -u > $dir/callhome_train_all/callhome_wav.scp || exit 1; +fi + +if [ $stage -le 4 ]; then + # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. 
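+  # (In this CALLHOME setup the temporary map comes from callhome_make_trans.pl,
+  # which appears to label every speaker "m" because the transcripts carry no
+  # gender information; callhome_make_spk2gender.sh then post-processes that file.)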
+ cd $cdir + #TODO: needs to be rewritten + $local/callhome_make_spk2gender.sh > $dir/callhome_train_all/callhome_spk2gender +fi + +# Rename files from the callhome directory +if [ $stage -le 5 ]; then + cd $dir/callhome_train_all + mv callhome.text text + mv callhome_segments segments + mv callhome_spk2utt spk2utt + mv callhome_wav.scp wav.scp + mv callhome_reco2file_and_channel reco2file_and_channel + mv callhome_spk2gender spk2gender + mv callhome_utt2spk utt2spk + cd $cdir +fi + +fix_data_dir.sh $dir/callhome_train_all || exit 1 +utils/validate_data_dir.sh --no-feats $dir/callhome_train_all || exit 1 + +echo "CALLHOME spanish Data preparation succeeded." + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py new file mode 100755 index 00000000000..a81818c2858 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +import os +import sys + +def findTranscription(timeDetail): + file1 = open('exp/tri5a/decode_callhome_dev/scoring/13.tra') + file2 = open('exp/tri5a/decode_callhome_train/scoring/13.tra') + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + for line in file2: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + + +wordsFile = open('exp/tri5a/graph/words.txt') +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
+# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists('exp/tri5a/one-best/ch_train'): + os.makedirs('exp/tri5a/one-best/ch_train') + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/asr.train', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') + newFile = open('exp/tri5a/one-best/ch_train/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + + newFile.close() +provFile.close() + + + + + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py new file mode 100755 index 00000000000..4c96e01ce7e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +from __future__ import print_function +import os +import sys +import subprocess + +latticeLocation = 'latjosh-2-callhome/lattices-pushed/' + +tmpdir = 'data/local/data/tmp/ch-d/lattmp' +invalidplfdir = 'data/local/data/tmp/ch-d/invalidplf' +symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' + +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/dev') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/asr.test.plf', 'w+') +invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/invalidPLF', 'w+') +blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/blankPLF', 'w+') +rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/removeLines', 'w+') + +if not os.path.exists(tmpdir): + os.makedirs(tmpdir) +if not os.path.exists(invalidplfdir): + os.makedirs(invalidplfdir) +else: + os.system("rm " + invalidplfdir + "/*") + +def latticeConcatenate(lat1, lat2): + ''' + Concatenates lattices, writes temporary results to tmpdir + ''' + if lat1 == "": + os.system('rm ' + tmpdir + '/tmp.lat') + return lat2 + else: + proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) + proc.wait() + return tmpdir + '/tmp.lat' + + +def findLattice(timeDetail): + ''' + Finds the lattice corresponding to a time segment + ''' + if os.path.isfile(latticeLocation + timeDetail + '.lat'): + return latticeLocation + timeDetail + '.lat' + else: + return -1 + + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
+# Now get timing information to concatenate the ASR outputs + +lineNo = 1 +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') + for line in timingFile: + timeInfo = line.split() + + # For utterances that are concatenated in the translation file, + # the corresponding FSTs have to be translated as well + mergedTranslation = "" + for timeDetail in timeInfo: + tmp = findLattice(timeDetail) + if tmp != -1: + # Concatenate lattices + mergedTranslation = latticeConcatenate(mergedTranslation, tmp) + + print(mergedTranslation) + if mergedTranslation != "": + + # Sanjeev's Recipe : Remove epsilons and topo sort + finalFST = tmpdir + "/final.fst" + os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) + + # Now convert to PLF + proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) + PLFline = proc.stdout.readline() + finalPLFFile = tmpdir + "/final.plf" + finalPLF = open(finalPLFFile, "w+") + finalPLF.write(PLFline) + finalPLF.close() + + # now check if this is a valid PLF, if not write it's ID in a + # file so it can be checked later + proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) + line = proc.stdout.readline() + print("{} {}".format(line, lineNo)) + if line.strip() != "PLF format appears to be correct.": + os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) + invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + else: + provFile.write(PLFline) + else: + blankPLF.write(timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + # Now convert to PLF + lineNo += 1 + +provFile.close() +invalidPLF.close() +blankPLF.close() +rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh new file mode 100755 index 00000000000..d06e5fe911f --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Gets the unique speakers from the file created by fsp_make_trans.pl +# Note that if a speaker appears multiple times, it is categorized as female + +import os +import sys + +tmpFileLocation = 'data/local/tmp/callhome_spk2gendertmp' + +tmpFile = None + +try: + tmpFile = open(tmpFileLocation) +except IOError: + print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' + +speakers = {} + +for line in tmpFile: + comp = line.split(' ') + if comp[0] in speakers: + speakers[comp[0]] = "f" + else: + speakers[comp[0]] = comp[1] + +for speaker, gender in speakers.iteritems(): + print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl new file mode 100755 index 00000000000..ec3dfd88037 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl @@ -0,0 +1,74 @@ +#!/usr/bin/env perl +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +use utf8; +use File::Basename; + +($tmpdir)=@ARGV; +$trans="$tmpdir/callhome_train_transcripts.flist"; +$reco="$tmpdir/callhome_reco2file_and_channel"; +open(T, "<", "$trans") || die "Can't open transcripts file"; +open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; +open(O, ">$tmpdir/callhome.text.1") || die "Can't open text file for writing"; +open(G, ">$tmpdir/callhome_spk2gendertmp") || die "Can't open the speaker to gender map file"; +binmode(O, ":utf8"); +while () { + $file = $_; + m:([^/]+)\.txt: || die "Bad filename $_"; + $call_id = $1; + print R "$call_id-A $call_id A\n"; + print R "$call_id-B $call_id B\n"; + open(I, "<$file") || die "Opening file $_"; + binmode(I, ":iso88591"); + #Now read each line and extract information + while () { + #136.37 138.10 B: Ah, bueno, mamita. + chomp; + + my @stringComponents = split(":", $_, 2); + my @timeInfo = split(" ", $stringComponents[0]); + $stringComponents[1] =~ s/^\s+|\s+$//g ; + my $words = $stringComponents[1]; + #Check number of components in this array + if ((scalar @stringComponents) >= 2) { + $start = sprintf("%06d", $timeInfo[0] * 100); + $end = sprintf("%06d", $timeInfo[1] * 100); + length($end) > 6 && die "Time too long $end in $file"; + $side = "A"; + if (index($timeInfo[2], "B") != -1) { + $side = "B"; + } + $utt_id = "${call_id}-$side-$start-$end"; + $speaker_id = "${call_id}-$side"; + # All speakers are treated as male because speaker gender info + # is missing in this file + $gender = "m"; + print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; + $words =~ s|\[\[[^]]*\]\]||g; #removes comments + $words =~ s|\{laugh\}|\$laughter\$|g; # replaces laughter tmp + $words =~ s|\[laugh\]|\$laughter\$|g; # replaces laughter tmp + $words =~ s|\{[^}]*\}|\[noise\]|g; # replaces noise + $words =~ s|\[[^]]*\]|\[noise\]|g; # replaces noise + $words =~ s|\[/*([^]]*)\]|\[noise\]|g; # replaces end of noise + $words =~ s|\$laughter\$|\[laughter\]|g; # replaces laughter again + $words =~ s|\(\(([^)]*)\)\)|\1|g; # replaces unintelligible speech + $words =~ s|<\?([^>]*)>|\1|g; # for unrecognized language + $words =~ s|background speech|\[noise\]|g; + $words =~ s|background noise|\[noise\]|g; + $words =~ s/\[/larrow/g; + $words =~ s/\]/rarrow/g; + $words =~ s/[[:punct:]]//g; + $words =~ s/larrow/\[/g; + $words =~ s/rarrow/\]/g; + $words =~ s/[¿¡]//g; + $words =~ s/\h+/ /g; # horizontal whitespace characters + $words = lc($words); + print O "$utt_id $words\n" || die "Error writing to text file"; + } + } + close(I); +} +close(T); +close(R); +close(O); +close(G); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh new file mode 100755 index 00000000000..37e1eca1687 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +if [ $# -gt 0 ]; then + sentence=$1 + echo $sentence | sed 's:{^[}]*}:[noise]:' +fi + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh new file mode 100755 index 00000000000..c487f1bd222 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. +# with bypass resnet connections, and re-tuned. 
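+# Example invocation from egs/fisher_callhome_spanish/s5_gigaword (the values
+# shown are just the script defaults, so this is illustrative rather than required):
+#   local/chain/run_tdnn_1g.sh --stage 0 --train-set train --gmm tri5a
+#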
+# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:- +# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ] +# %SER 56.98 [ 3577 / 6278 ] +# Scored 6278 sentences, 0 not present in hyp. + +# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn +# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="test dev" +gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 17 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 18 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 19 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand $srand \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.frames-per-iter 5000000 \ + --trainer.optimization.num-jobs-initial 1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.optimization.momentum 0.0 \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context 0 \ + --egs.chunk-right-context 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 21 ]; then + # The reason we are using data/lang_test here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + #LM was trained only on Fisher Spanish train subset. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph_fsp_train || exit 1; + +fi + +rnnlmdir=exp/rnnlm_lstm_tdnn_1b +if [ $stage -le 22 ]; then + local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; +fi + +if [ $stage -le 23 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l " + exit 1; +fi + +txtdir=$1 +textdir=$(realpath $txtdir) +outdir=$(realpath $2) + +workdir=$outdir/tmp +if [ $stage -le 0 ]; then + rm -rf $outdir + mkdir -p $workdir + mkdir -p $textdir/splits + mkdir -p $outdir/data + split -l 1000000 $textdir/in.txt $textdir/splits/out + numsplits=0 + for x in $textdir/splits/*; do + numsplits=$((numsplits+1)) + ln -s $x $outdir/data/$numsplits + done + echo $numsplits + cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . 
+ $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ + local/run_norm.sh \ + sparrowhawk_configuration.ascii_proto \ + $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $outdir/data \ + JOB \ + $outdir/sparrowhawk/ + cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized + + # check if numbers are there in normalized output + awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ + $outdir/text_normalized > $outdir/unique_words + grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers +fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh new file mode 100755 index 00000000000..d48a96db5c4 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# No sanity checks here, they need to be added + +data=data/callhome_test +dir=exp/tri5a/decode_callhome_test +lang=data/lang +LMWT=13 + +[ -f ./path.sh ] && . ./path.sh + +cmd=run.pl +filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" +name=`basename $data`; +model=$dir/../final.mdl # assume model one level up from decoding dir. +symTable=$lang/words.txt + +if [ ! -f $dir/oracle/oracle.lat.gz ]; then + cat $data/text | utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ + lattice-oracle --write-lattices="ark:|gzip -c > $dir/oracle/oracle.lat.gz" \ + "ark:gunzip -c $dir/lat.*.gz|" ark:- ark:- > /dev/null 2>&1 +fi + +lattice-align-words $lang/phones/word_boundary.int $model \ + "ark:gunzip -c $dir/oracle/oracle.lat.gz|" ark:- | \ + lattice-1best --lm-scale=$LMWT ark:- ark:- | nbest-to-ctm ark:- - | \ + utils/int2sym.pl -f 5 $lang/words.txt | \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + > $dir/oracle/$name.ctm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh new file mode 100755 index 00000000000..8a60dc9d422 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +data_dir=data +train_all=data/train_all + +if [ $# -lt 1 ]; then + echo "Specify the location of the split files" + exit 1; +fi + +splitFile=$1 + +# Train first +for split in train dev test dev2 +do + + cp -r $train_all $data_dir/$split + + awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ + $splitFile/$split $train_all/segments > $data_dir/$split/segments + + n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$split/segments | sort | uniq | wc -l` + + echo "$n conversations left in split $split" + + utils/fix_data_dir.sh $data_dir/$split + utils/validate_data_dir.sh $data_dir/$split +done + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh new file mode 100755 index 00000000000..7d09f574580 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +. ./cmd.sh + +split=test +data_dir=data/test +decode_dir=exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4/ +lang_dir=data/lang + +# Create the STM file +# Always create this file before creating the CTM files so that +# channel numbers are properly created. +if [ ! 
-f $data_dir/stm ]; then + /export/a11/guoguo/babel/103-bengali-limitedLP.official/local/prepare_stm.pl $data_dir +fi + +# Create the CTM file +steps/get_ctm.sh $data_dir $lang_dir $decode_dir + +# Make sure that channel markers match +#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} + +# Get the environment variables +. /export/babel/data/software/env.sh + +# Start scoring +/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ + $decode_dir + +# Print a summary of the result +grep "Percent Total Error" $decode_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py b/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py new file mode 100755 index 00000000000..6f3d3f80c95 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python + +# Author : Gaurav Kumar (Johns Hopkins University) +# Gets a report on what the best word error rate was and which iteration +# led to it. This is needed both for reporting purposes and for setting +# the acoustic scale weight which extracting lattices. +# This script is specific to my partitions and needs to be made more general +# or modified + +from __future__ import print_function +import subprocess +import os + +decode_directories = ['exp/tri5a/decode_dev', + 'exp/tri5a/decode_test', + 'exp/tri5a/decode_dev2', + 'exp/sgmm2x_6a/decode_dev_fmllr', + 'exp/sgmm2x_6a/decode_test_fmllr', + 'exp/sgmm2x_6a/decode_dev2_fmllr', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it4', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it1', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it2', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it3', + 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4' + ] + +def get_best_wer(decode_dir): + best_iteration = 0 + best_wer = 100.0 + for i in range(16): + if os.path.isfile("{}/wer_{}".format(decode_dir, i)): + result = subprocess.check_output("tail -n 3 {}/wer_{}".format(decode_dir, i), shell=True) + wer_string = result.split("\n")[0] + wer_details = wer_string.split(' ') + # Get max WER + wer = float(wer_details[1]) + if wer < best_wer: + best_wer = wer + best_iteration = i + return best_iteration, best_wer + +for decode_dir in decode_directories[:6]: + print(decode_dir) + print(get_best_wer(decode_dir)) + +# Separate processing for bMMI stuff +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[6:10]: + iteration, wer 
= get_best_wer(decode_dir) + if wer < best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[10:14]: + iteration, wer = get_best_wer(decode_dir) + if wer < best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[14:18]: + iteration, wer = get_best_wer(decode_dir) + if wer < best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[18:22]: + iteration, wer = get_best_wer(decode_dir) + if wer <= best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[22:26]: + iteration, wer = get_best_wer(decode_dir) + if wer <= best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) + +best_wer = 100.0 +best_dir = "" +best_iteration = 0 + +for decode_dir in decode_directories[26:]: + iteration, wer = get_best_wer(decode_dir) + if wer <= best_wer: + best_wer = wer + best_dir = decode_dir + best_iteration = iteration + +print(best_dir) +print((best_iteration, best_wer)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl new file mode 100755 index 00000000000..2da41182d20 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +#Finds unique phones from the basic rules file +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +use utf8; + +($b)=$ARGV[0]; +($tmpdir)=$ARGV[1]; +open(BB, "<", "$b/basic_rules") || die "Can't open basic rules"; +binmode(BB, ":iso88591"); +open(O, ">$tmpdir/phones") || die "Can't open text file for writing"; +binmode(O, ":utf8"); +my %phones = qw(); +while () { + chomp; + my @stringComponents = split(/\t/); + m/->\s(\S+)/; + my $phone = $1; + $phone =~ tr/áéíóú/aeiou/; + $phones{$phone} = 1; +} +foreach my $p (keys %phones) { + print O $p, "\n"; +} +#print keys %phones; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh new file mode 100755 index 00000000000..20220d107bc --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +# Fixes the CALLHOME stm files +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +data_dir=$1 + +cat $data_dir/stm | awk '{$1=substr(tolower($1),0,length($1)-4);print;}' > $data_dir/stm_new +mv $data_dir/stm $data_dir/stm.bak +mv $data_dir/stm_new $data_dir/stm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh new file mode 100755 index 00000000000..242359e7c28 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +# Path to Gigaword corpus with all data files decompressed. 
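+# Example invocation (the paths and job count here are illustrative assumptions):
+#   local/flatten_gigaword/flatten_all_gigaword.sh /path/to/gigaword_es /path/to/flat_out 8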
+export GIGAWORDDIR=$1 +# The directory to write output to +export OUTPUTDIR=$2 +# The number of jobs to run at once +export NUMJOBS=$3 + +echo "Flattening Gigaword with ${NUMJOBS} processes..." +mkdir -p $OUTPUTDIR +find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; +echo "Combining the flattened files into one..." +cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py new file mode 100644 index 00000000000..29f6766dd84 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import logging +import os +import re +import spacy +import gzip + +from argparse import ArgumentParser +from bs4 import BeautifulSoup + +en_nlp = spacy.load("es") + + +def flatten_one_gigaword_file(file_path): + f = gzip.open(file_path) + html = f.read() + # Parse the text with BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + + # Iterate over all
<p>
items and get the text for each. + all_paragraphs = [] + for paragraph in soup("p"): + # Turn inter-paragraph newlines into spaces + paragraph = paragraph.get_text() + paragraph = re.sub(r"\n+", "\n", paragraph) + paragraph = paragraph.replace("\n", " ") + # Tokenize the paragraph into words + tokens = en_nlp.tokenizer(paragraph) + words = [str(token) for token in tokens if not + str(token).isspace()] + if len(words) < 3: + continue + all_paragraphs.append(words) + # Return a list of strings, where each string is a + # space-tokenized paragraph. + return [" ".join(paragraph) for paragraph in all_paragraphs] + + +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + parser = ArgumentParser(description=("Flatten a gigaword data file for " + "use in language modeling.")) + parser.add_argument("--gigaword-path", required=True, + metavar="", type=str, + help=("Path to Gigaword directory, with " + "all .gz files unzipped.")) + parser.add_argument("--output-dir", required=True, metavar="", + type=str, help=("Directory to write final flattened " + "Gigaword file.")) + + A = parser.parse_args() + all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) + output_path = os.path.join(A.output_dir, + os.path.basename(A.gigaword_path) + ".flat") + with open(output_path, "w") as output_file: + for paragraph in all_paragraphs: + output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh new file mode 100755 index 00000000000..6b236be0ab9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -e + +. ./path_venv.sh + +# Path to Gigaword corpus with all data files decompressed. +GIGAWORDPATH=$1 +# The directory to write output to +OUTPUTDIR=$2 +file=$(basename ${GIGAWORDPATH}) +if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then + echo "flattening to ${OUTPUTDIR}/${file}.flat" + python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} +else + echo "skipping ${file}.flat" +fi + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh new file mode 100755 index 00000000000..fb765b57e69 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# + +if [ -f path.sh ]; then . ./path.sh; fi + +mkdir -p data/lang_test + +arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz +[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; + +mkdir -p data/lang_test +cp -r data/lang/* data/lang_test + +gunzip -c "$arpa_lm" | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst + + +echo "Checking how stochastic G is (the first of these numbers should be small):" +fstisstochastic data/lang_test/G.fst + +## Check lexicon. +## just have a look and make sure it seems sane. +echo "First few lines of lexicon FST:" +fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head + +echo Performing further checks + +# Checking that G.fst is determinizable. +fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. 
+ +# Checking that L_disambig.fst is determinizable. +fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. + +# Checking that disambiguated lexicon times G is determinizable +# Note: we do this with fstdeterminizestar not fstdeterminize, as +# fstdeterminize was taking forever (presumbaly relates to a bug +# in this version of OpenFst that makes determinization slow for +# some case). +fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ + fstdeterminizestar >/dev/null || echo Error + +# Checking that LG is stochastic: +fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ + fstisstochastic || echo "[log:] LG is not stochastic" + + +echo "$0 succeeded" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh new file mode 100755 index 00000000000..11d65da3e95 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. +# To be run from one directory above this script. + +# Note: when creating your own data preparation scripts, it's a good idea +# to make sure that the speaker id (if present) is a prefix of the utterance +# id, that the output scp file is sorted on utterance id, and that the +# transcription file is exactly the same length as the scp file and is also +# sorted on utterance id (missing transcriptions should be removed from the +# scp file using e.g. scripts/filter_scp.pl) + +stage=0 + +export LC_ALL=C + + +if [ $# -lt 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" + exit 1; +fi + +cdir=`pwd` +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils +tmpdir=`pwd`/data/local/tmp +mkdir -p $tmpdir + +. ./path.sh || exit 1; # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi +cd $dir + +# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command +# line arguments being absolute pathnames. +rm -r links/ 2>/dev/null +mkdir links/ +ln -s $* links + +# Basic spot checks to see if we got the data that we needed +if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; +then + echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti +vely" + exit 1; +fi + +#if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; +if [ ! -d links/LDC2010S01/data/speech ]; +then + echo "Speech directories missing or not properly organised within the speech data dir" + echo "Typical format is LDC2010S01/data/speech" + exit 1; +fi + +#Check the transcripts directories as well to see if they exist +if [ ! 
-d links/LDC2010T04/fisher_spa_tr/data/transcripts ]; +then + echo "Transcript directories missing or not properly organised" + echo "Typical format is LDC2010T04/fisher_spa_tr/data/transcripts" + exit 1; +fi + +#speech_d1=$dir/links/LDC2010S01/DISC1/data/speech +#speech_d2=$dir/links/LDC2010S01/DISC2/data/speech +speech=$dir/links/LDC2010S01/data/speech +transcripts=$dir/links/LDC2010T04/fisher_spa_tr/data/transcripts + +#fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +#fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_s=`find ${speech} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts +#Now check if we got all the files that we needed +#if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +if [ $fcount_s != 819 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively (Total = 819)" + echo "The transcripts should contain 819 files" + exit 1; +fi + +if [ $stage -le 0 ]; then + #Gather all the speech files together to create a file list + #TODO: Train and test split might be required + ( + #find $speech_d1 -iname '*.sph'; + #find $speech_d2 -iname '*.sph'; + find $speech -iname '*.sph'; + ) > $tmpdir/train_sph.flist + + #Get all the transcripts in one place + find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist +fi + +if [ $stage -le 1 ]; then + $local/fsp_make_trans.pl $tmpdir + mkdir -p $dir/train_all + mv $tmpdir/reco2file_and_channel $dir/train_all/ +fi + +if [ $stage -le 2 ]; then + sort $tmpdir/text.1 | grep -v '((' | \ + awk '{if (NF > 1){ print; }}' | \ + sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ + sed 's:\([^<]*\)<\/lname>:\1:g' | \ + sed 's:::g' | \ + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed 's:<\s*cough[\/]*>:[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's:[^<]*<\/background>:[noise]:g' | \ + sed -r 's:<[/]?background[/]?>:[noise]:g' | \ + #One more time to take care of nested stuff + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ + #now handle the exceptions, find a cleaner way to do this? + sed 's:::g' | \ + sed 's:::g' | \ + sed 's:foreign>::g' | \ + sed 's:>::g' | \ + #How do you handle numbers? + grep -v '()' | \ + #Now go after the non-printable characters and multiple spaces + sed -r 's:¿::g' | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $tmpdir/text.2 + cp $tmpdir/text.2 $dir/train_all/text + + #Create segments file and utt2spk file + ! 
cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ + && echo "Error producing utt2spk file" && exit 1; + + cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; + $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); if ($s != $e) {print "$utt $reco $s $e\n"}; ' >$dir/train_all/segments + + $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt +fi + +if [ $stage -le 3 ]; then + for f in `cat $tmpdir/train_sph.flist`; do + # convert to absolute path + make_absolute.sh $f + done > $tmpdir/train_sph_abs.flist + + cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp + cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ + sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; +fi + +if [ $stage -le 4 ]; then + # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. + cd $cdir + $local/fsp_make_spk2gender.sh > $dir/train_all/spk2gender +fi + +fix_data_dir.sh $dir/train_all || exit 1 +validate_data_dir.sh --no-feats $dir/train_all || exit 1 + +echo "Fisher Spanish Data preparation succeeded." + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl new file mode 100755 index 00000000000..538bca58981 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl @@ -0,0 +1,85 @@ +#!/usr/bin/env perl +# +# Johns Hopkins University (Author : Gaurav Kumar) +# +# This script should be run from one directory above the current one +# +# Rough partitions that are needed are : +# +# ASR Train : 120k utterances +# ASR tune : 20k utterances +# ASR eval : 20k utterances +# MT train : 105k utterances +# MT tune : Same as the ASR eval (20k utterances) +# MT eval : 20k utterances +# +# This script tries to find the closest possible matches so that conversations +# belong in one single partition and hence there is no speaker/conversation +# overlap between data partitions + +use Storable 'dclone'; + +$textfile="data/local/data/train_all/text"; +$tmp="data/local/tmp"; + +open(T, "<", "$textfile") || die "Can't open text file"; + +$ongoingConv = ""; +%tmpSplits = (); +@splitNumbers = (17455, 20000, 100000, 20000, 100000); +$splitId = 0; +%splits = (); + +while () { + @myStringComponents = split(/\s/); + @uttid = split('-', $myStringComponents[0]); + $currentConv = $uttid[0]; + if ($currentConv eq $ongoingConv) { + # Same conversation, add to current hash + #print "Same conversation"; + $tmpSplits{$ongoingConv} += 1; + } + else { + # New conversation intiated, first check if there are enough entries + # in the hash + #print $ongoingConv . " " . get_entries_hash(\%tmpSplits) . "\n"; + if (get_entries_hash(\%tmpSplits) > $splitNumbers[$splitId]) { + print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. 
\n"; + #$splits{$splitId} = keys %tmpSplits; + @newArr = keys %tmpSplits; + $splits{$splitId} = dclone(\@newArr); + %tmpSplits = (); + $splitId += 1; + } + $ongoingConv = $currentConv; + $tmpSplits{$ongoingConv} = 1; + } +} +# Put final tmpsplits in the right partition +@newArr = keys %tmpSplits; +$splits{$splitId} = dclone(\@newArr); +foreach (keys %splits) { + #print $_ , " ", $splits{$_}, "\n"; +} +print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. \n"; + +# Write splits to file +foreach my $key ( keys %splits ) { + open(S, ">$tmp/split-$key") || die "Can't open splitfile to write"; + foreach my $file ( @{$splits{$key}} ) { + print $file, "\n"; + print S "$file\n" || die "Error writing to file"; + } + close(S); +} + +sub get_entries_hash() { + my $inputHashRef = shift; + $total = 0; + foreach (keys %{$inputHashRef}) + { + $total += $inputHashRef->{$_}; + } + return $total; +} + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh new file mode 100755 index 00000000000..15b1c0064cf --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Gets the unique speakers from the file created by fsp_make_trans.pl +# Note that if a speaker appears multiple times, it is categorized as female + +import os +import sys + +tmpFileLocation = 'data/local/tmp/spk2gendertmp' + +tmpFile = None + +try: + tmpFile = open(tmpFileLocation) +except IOError: + print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' + +speakers = {} + +for line in tmpFile: + comp = line.split(' ') + if comp[0] in speakers: + speakers[comp[0]] = "f" + else: + speakers[comp[0]] = comp[1] + +for speaker, gender in speakers.iteritems(): + print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl new file mode 100755 index 00000000000..8c3f74e3917 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl @@ -0,0 +1,81 @@ +#!/usr/bin/env perl +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +use utf8; +use File::Basename; +($tmpdir)=@ARGV; +#$tmpdir='../data/local/tmp'; +$trans="$tmpdir/train_transcripts.flist"; +$reco="$tmpdir/reco2file_and_channel"; +open(T, "<", "$trans") || die "Can't open transcripts file"; +open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; +open(O, ">$tmpdir/text.1") || die "Can't open text file for writing"; +open(G, ">$tmpdir/spk2gendertmp") || die "Can't open the speaker to gender map file"; +binmode(O, ":utf8"); +while () { + $file = $_; + m:([^/]+)\.tdf: || die "Bad filename $_"; + $call_id = $1; + print R "$call_id-A $call_id A\n"; + print R "$call_id-B $call_id B\n"; + open(I, "<$file") || die "Opening file $_"; + binmode(I, ":utf8"); + # Get rid of header sections first + foreach ( 0..2 ) { + $tmpLine = ; + } + #Now read each line and extract information + while () { + #20051017_215732_274_fsp.sph 1 0.0 0.909856781803 Audrey female native Audrey 0 0 -1 + chomp; + my @stringComponents = split(/\t/); + + #Check number of components in this array + if ((scalar @stringComponents) >= 11) { + $start = sprintf("%06d", $stringComponents[2] * 100); + $end = sprintf("%06d", $stringComponents[3] * 100); + length($end) > 6 && die "Time too long $end in $file"; + $side = $stringComponents[1] ? "B" : "A"; + $words = $stringComponents[7]; + $utt_id = "${call_id}-$side-$start-$end"; + $speaker_id = "${call_id}-$side"; + $gender = "m"; + if ($stringComponents[5] == "female") { + $gender = "f"; + } + print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; + $words =~ s:/rarrow/g; + $words =~ s/[[:punct:]]//g; + $words =~ s/larrow//g; + $words =~ s:lendarrow: 0){ print; }}' > $tmpdir/uniquewords + if [ ! -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xovzf es_wordlist.json.tgz || exit 1; + cd $cwd + fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords +fi + +#Then get the list of phones form basic_rules in the lexicon folder +if [ $stage -le 1 ]; then + if [ ! 
-d "$lexicon/callhome_spanish_lexicon_970908" ]; then + echo "Could not find folder callhome_spanish_lexicon_970908 in the lexicon folder" + exit 1; + fi + + # This is a preliminary attempt to get the unique phones from the LDC lexicon + # This will be extended based on our lexicon later + perl $local/find_unique_phones.pl $lexicon/callhome_spanish_lexicon_970908 $tmpdir + +fi + +#Get pronunciation for each word using the spron.pl file in the lexicon folder +if [ $stage -le 2 ]; then + #cd $lexicon/callhome_spanish_lexicon_970908 + # Replace all words for which no pronunciation was generated with an orthographic + # representation + cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ + | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ + > $tmpdir/lexicon_raw +fi + +#Break the pronunciation down according to the format required by Kaldi +if [ $stage -le 3 ]; then + # Creates a KALDI compatible lexicon, and extends the phone list + perl $local/isolate_phones.pl $tmpdir + cat $tmpdir/phones_extended | sort | awk '{if ($1 != "") {print;}}' > $tmpdir/phones_extended.1 + mv $tmpdir/phones $tmpdir/phones.small + mv $tmpdir/phones_extended.1 $tmpdir/phones + sort $tmpdir/phones -o $tmpdir/phones + paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | sed -r 's:(\S+)\s#.*:\1 oov:g' > $tmpdir/lexicon.1 + #paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | grep -v '#' > $tmpdir/lexicon.1 +fi + +if [ $stage -le 4 ]; then + # silence phones, one per line. + for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt + echo sil > $dir/optional_silence.txt + + # An extra question will be added by including the silence phones in one class. 
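+  # With the silence phones listed just above, the awk one-liner below should
+  # leave extra_questions.txt containing the single line:
+  #   sil laughter noise oov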
+ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ + $dir/extra_questions.txt || exit 1; + + # Remove [] chars from phones + cat $tmpdir/phones | awk '{if ($1 != "_" && $1 != "[" && $1 != "]") {print;}}' > $tmpdir/phones.1 + rm $tmpdir/phones + mv $tmpdir/phones.1 $tmpdir/phones + cp $tmpdir/phones $dir/nonsilence_phones.txt + + if [ -f $tmpdir/lexicon.2 ]; then rm $tmpdir/lexicon.2; fi + cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" + + # Add prons for laughter, noise, oov + for w in `grep -v sil $dir/silence_phones.txt`; do + sed -i "/\[$w\]/d" $tmpdir/lexicon.2 + done + + for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" + done | cat - $tmpdir/lexicon.2 > $tmpdir/lexicon.3 || exit 1; + + cat $tmpdir/lexicon.3 \ + <( echo "mm m" + echo " oov" ) > $tmpdir/lexicon.4 + + # From the lexicon remove _ from the phonetic representation + cat $tmpdir/lexicon.4 | sed 's:\s_::g' > $tmpdir/lexicon.5 + + cp "$tmpdir/lexicon.5" $dir/lexicon.txt + + cat $datadir/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $tmpdir/word_counts + + awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $tmpdir/word_counts > $tmpdir/oov_counts.txt + echo "*Highest-count OOVs are:" + head -n 20 $tmpdir/oov_counts.txt +fi + +$utils/validate_dict_dir.pl $dir +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh new file mode 100755 index 00000000000..cebf3b222ab --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# To be run from one level above this directory +# Generate the text for the LM training +tmp_dir=data/local/tmp +train_all=data/local/data/train_all + +if [ $# -lt 1 ]; then + echo "Specify the location of the split files" + exit 1; +fi + +splitFile=$1 +split=train +# Train only +if [ -d $tmp_dir/$split ]; then + rm -r $tmp_dir/$split +fi +cp -r $train_all $tmp_dir/$split + +awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ +$splitFile/$split $train_all/segments > $tmp_dir/$split/segments + +n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $tmp_dir/$split/segments | sort | uniq | wc -l` + +echo "$n conversations left in split $split" + +utils/fix_data_dir.sh $tmp_dir/$split +# There is no feature file yet, use --no-feats switch +utils/validate_data_dir.sh --no-feats $tmp_dir/$split + +# Now use this training text + +text=$tmp_dir/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# fisher_data_prep.sh and fisher_prepare_dict.sh +# It takes as input the files +#data/train_all/text +#data/local/dict/lexicon.txt + +dir=`pwd`/data/local/lm +mkdir -p $dir +export LC_ALL=C # You'll get errors about things being not sorted, if you +# have a different locale. +export PATH=$PATH:`pwd`/../../../tools/kaldi_lm +( # First make sure the kaldi_lm toolkit is installed. + cd ../../../tools || exit 1; + if [ -d kaldi_lm ]; then + echo Not installing the kaldi_lm toolkit since it is already there. 
+ else + echo Downloading and installing the kaldi_lm tools + if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; + fi + tar -xvzf kaldi_lm.tar.gz || exit 1; + cd kaldi_lm + make || exit 1; + echo Done making the kaldi_lm tools + fi +) || exit 1; + +mkdir -p $dir + + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + + +exit 0 + +echo "Baseline" + +# From here is some commands to do a baseline with SRILM (assuming +# you have it installed). +heldout_sent=158126 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train + +cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist + + +ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout + +# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM +# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs +# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258 + + +# Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above. +# Difference in WSJ must have been due to different treatment of . +ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout + +# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM +# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs +# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py new file mode 100755 index 00000000000..9c590635562 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +import os +import sys + +scoringFile = "exp/sgmm2x_6a_mmi_b0.2/decode_test_it4/scoring/10.tra" +wordsFile = open('exp/sgmm2x_6a/graph/words.txt') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') +oneBestTmp = 'exp/sgmm2x_6a_mmi_b0.2/one-best/asr-test' +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.test', 'w+') +timLocation = '/export/a04/gkumar/corpora/fishcall/fisher/tim' + +def findTranscription(timeDetail): + file1 = open(scoringFile) + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists(oneBestTmp): + os.makedirs(oneBestTmp) + +for item in fileList: + timingFile = open(timLocation + '/' + item + '.es') + newFile = open(oneBestTmp + '/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + + newFile.close() +provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl new file mode 100755 index 00000000000..ca5b2a46f8e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl @@ -0,0 +1,39 @@ +#!/usr/bin/env perl + +# Nagendra Kumar Goel + +# This takes two arguments: +# 1) Pocolm training output folder +# 2) rnnlm weights file name (for output) + +use POSIX; +use List::Util qw[min max]; + +if (@ARGV != 2) { + die "Usage: get_data_weights.pl \n"; +} + +$pdir = shift @ARGV; +$out = shift @ARGV; + +open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; +open(N, "<$pdir/names") || die "Could not open $pdir/names" ; +open(O, ">$out") || die "Could not open $out for writing" ; + +my %scores = (); + +while() { + @n = split(/\s/,$_); + $name = $n[1]; + $w =
<P>
; + @w = split(/\s/,$w); + $weight = $w[1]; + $scores{$name} = $weight; +} + +$min = min(values %scores); + +for(keys %scores) { + $weightout = POSIX::ceil($scores{$_} / $min); + print O "$_\t1\t$weightout\n"; +} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py new file mode 100755 index 00000000000..5430c18bb5b --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# Extracts one best output for a set of files +# The list of files in the conversations for which 1 best output has to be extracted +# words.txt + +from __future__ import print_function +import os +import sys +import subprocess + +latticeLocation = 'latjosh-bmmi/lattices-pushed/' + +tmpdir = 'data/local/data/tmp/bmmi-t/lattmp' +invalidplfdir = 'data/local/data/tmp/bmmi-t/invalidplf' +symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' + +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/asr.test.plf', 'w+') +invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/invalidPLF', 'w+') +blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/blankPLF', 'w+') +rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/removeLines', 'w+') + +if not os.path.exists(tmpdir): + os.makedirs(tmpdir) +if not os.path.exists(invalidplfdir): + os.makedirs(invalidplfdir) +else: + os.system("rm " + invalidplfdir + "/*") + +def latticeConcatenate(lat1, lat2): + ''' + Concatenates lattices, writes temporary results to tmpdir + ''' + if lat1 == "": + os.system('rm ' + tmpdir + '/tmp.lat') + return lat2 + else: + proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) + proc.wait() + return tmpdir + '/tmp.lat' + + +def findLattice(timeDetail): + ''' + Finds the lattice corresponding to a time segment + ''' + if os.path.isfile(latticeLocation + timeDetail + '.lat'): + return latticeLocation + timeDetail + '.lat' + else: + return -1 + + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
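+# Rough flow implemented below, per utterance: look up the per-segment lattice
+# FSTs, fstconcat them together, clean up with fstrmepsilon | fsttopsort,
+# convert the result to a PLF line with fsm2plf.sh, and validate it with Moses'
+# checkplf; utterances whose PLF is empty or invalid are logged so the matching
+# lines can later be dropped from the parallel text.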
+# Now get timing information to concatenate the ASR outputs + +lineNo = 1 +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + for line in timingFile: + timeInfo = line.split() + + # For utterances that are concatenated in the translation file, + # the corresponding FSTs have to be translated as well + mergedTranslation = "" + for timeDetail in timeInfo: + tmp = findLattice(timeDetail) + if tmp != -1: + # Concatenate lattices + mergedTranslation = latticeConcatenate(mergedTranslation, tmp) + + print(mergedTranslation) + if mergedTranslation != "": + + # Sanjeev's Recipe : Remove epsilons and topo sort + finalFST = tmpdir + "/final.fst" + os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) + + # Now convert to PLF + proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) + PLFline = proc.stdout.readline() + finalPLFFile = tmpdir + "/final.plf" + finalPLF = open(finalPLFFile, "w+") + finalPLF.write(PLFline) + finalPLF.close() + + # now check if this is a valid PLF, if not write it's ID in a + # file so it can be checked later + proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) + line = proc.stdout.readline() + print("{} {}".format(line, lineNo)) + if line.strip() != "PLF format appears to be correct.": + os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) + invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + else: + provFile.write(PLFline) + else: + blankPLF.write(timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + # Now convert to PLF + lineNo += 1 + +provFile.close() +invalidPLF.close() +blankPLF.close() +rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh new file mode 100755 index 00000000000..451a7c529fb --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Gets lattice oracles +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +if [ $# -lt 3 ]; then + echo "Specify lattice dir, symbol table and text file for partition" + exit 1; +fi + +latticeDir=$1 +textFile=$3 +symTable=$2 +oracleDir=$latticeDir/oracle + +echo $latticeDir +echo $oracleDir + +. ./path.sh + +if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then + echo "Required files not found" + exit 1; +fi + +mkdir -p $oracleDir + +cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \ + utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ + $KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log + +sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl new file mode 100755 index 00000000000..0366dcdacb0 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl @@ -0,0 +1,66 @@ +#!/usr/bin/env perl +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 +# Once the phonetic representation for words is generated by the LDC lexicon +# This script converts them into a KALDI compatible format +# In addition, it extends the list of phonemes to consider based on +# orthograhic representations of those words which do not have stressed vowels + +use utf8; + +($tmpdir)=$ARGV[0]; +open(L, "<", "$tmpdir/lexicon_raw") || die "Can't open raw lexicon"; +open(P, "<" , "$tmpdir/phones") || die "Can't open phone file"; +open(I, ">$tmpdir/lexicon_one_column") || die "Can't open text file for writing"; +open(E, ">$tmpdir/phones_extended") || die "Can't open ex-phone file for writing"; +binmode(P, ":utf8"); +binmode(L, ":utf8"); +binmode(I, ":utf8"); +binmode(E, ":utf8"); + +#Get all phones +my %phones = qw(); +while (
<P>
) { + chomp; + $phones{$_} = 1; +} + +print @phones; + +while () { + if (substr($_, 0, 1) eq "#") { + print I $_; + next; + } + $len = length; + $current = 0; + $splitWord = ""; + while ($current < $len) { + #First check for two char codes + $currentChar2 = substr($_, $current, 2); + $currentChar1 = substr($_, $current, 1); + if (exists($phones{$currentChar2})) { + $splitWord = $splitWord . " " . $currentChar2; + $current = $current + 2; + } + else { + # Check if this phone exists + if (!exists($phones{$currentChar1})) { + $phones{$currentChar1} = 1 + } + $splitWord = $splitWord . " " . $currentChar1; + $current = $current + 1; + } + } + $splitWord =~ s/^\s*(.*?)\s*$/$1/; + print I $splitWord, "\n"; +} + +# Now write the phones to the extended phone file +foreach my $key (keys %phones) { + print E $key, "\n"; +} + +close(L); +close(P); +close(I); +close(E); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh new file mode 100755 index 00000000000..bbe0af5810c --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# Author : Gaurav Kumar, Johns Hopkins University +# Creates OpenFST lattices from Kaldi lattices +# This script needs to be run from one level above this directory + +. ./path.sh + +if [ $# -lt 3 ]; then + echo "Enter the latdir (where the lattices will be put), the decode dir containing lattices and the acoustic scale" + exit 1 +fi + +prunebeam=2 + +latdir=$1 +decode_dir=$2 +acoustic_scale=$3 +#latdir="latjosh-2-callhome" +#decode_dir=exp/tri5a/decode_$partition +#acoustic_scale=0.077 + +stage=0 + +if [ -d $decode_dir ] +then + # TODO:Add scaling factor for weights, how? + rawLatDir="lattices" + compiledLatDir="lattices-bin" + preplfLatDir="lattices-pushed" + + mkdir -p $latdir + mkdir -p $latdir/$rawLatDir + mkdir -p $latdir/$compiledLatDir + mkdir -p $latdir/$preplfLatDir + + for l in $decode_dir/lat.*.gz + do + ( + # Extract file name and unzip the file first + bname=${l##*/} + bname="$latdir/${bname%.gz}" + gunzip -c $l > "$bname.bin" + + if [ $stage -le 0 ]; then + + # Now copy into ark format + $KALDI_ROOT/src/latbin/lattice-copy ark:$bname.bin ark,t:- > "$bname.raw" + + # Prune lattices + $KALDI_ROOT/src/latbin/lattice-prune --acoustic-scale=$acoustic_scale --beam=$prunebeam ark:"$bname.raw" ark:"$bname.pruned" + + # Convert to an openfst compatible format + $KALDI_ROOT/src/latbin/lattice-to-fst --lm-scale=1.0 --acoustic-scale=$acoustic_scale ark:$bname.pruned ark,t:$bname.ark.fst + + fi + + if [ $stage -le 1 ]; then + fileName="" + fileLine=0 + + while read line; do + if [ $fileLine = 0 ]; then + fileName="$line" + fileLine=1 + continue + fi + if [ -z "$line" ]; then + fileLine=0 + continue + fi + # Replace laugh, unk, oov, noise with eps + echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat" + done < $bname.ark.fst + echo "Done isolating lattices" + fi + ) & + done + wait + rm $latdir/*.bin + rm $latdir/*.pruned + + + if [ $stage -le 2 ]; then + #Compile lattices + for l in $latdir/$rawLatDir/*.lat + do + ( + # Arc type needs to be log + bname=${l##*/} + fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname + ) & + done + wait + echo "Done compiling lattices." 
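+    # The compiled lattices are in the log semiring (--arc_type=log above), as
+    # the weight pushing and PLF conversion in the next stage expect; if in
+    # doubt this can be confirmed with e.g. `fstinfo FILE.lat | grep 'arc type'`,
+    # where FILE.lat stands for any lattice under $latdir/$compiledLatDir.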
+ fi + + if [ $stage -le 3 ]; then + #Sanjeev's Recipe for creating valid PLF compatible FSTs" + # Create a dummy FST with one state and no arcs first + echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst + # Push Lattice weights towards initial state + for l in $latdir/$compiledLatDir/*.lat + do + ( + bname=${l##*/} + fstrmepsilon $latdir/$compiledLatDir/$bname | \ + fstpush --push_weights --remove_total_weight - | \ + # Do not topo sort here, do it before converting into PLF + # Sanjeev's Recipe : Concatenate with dummy FST + fstconcat - $latdir/$preplfLatDir/dummy.fst | \ + fstreverse - | \ + fstrmepsilon - | \ + fstreverse - $latdir/$preplfLatDir/$bname + ) & + done + wait + # Let's take a moment to thank the dummy FST for playing its + # part in this process. However, it has to go now. + rm $latdir/$preplfLatDir/dummy.fst + echo "Done performing fst push (initial state)" + fi +else + echo "Complete training and decoding first" +fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py new file mode 100755 index 00000000000..94546dc44c3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., Avaaya + +# Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon +from __future__ import print_function +import sys +import re +import json +import codecs +import operator + +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" + +filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') +merged_lexicon = [] +# All three lexicons are in different formats +# First add the data from lexicon_fisher (A) into the dictionary +fisher = codecs.open(uw_fisher, encoding='utf-8') +for line in fisher: + merged_lexicon.append(line.strip()) +fisher.close() + +print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) + +# Now add data from the LDC lexicon +ldc = codecs.open(uw_LDC, encoding='iso-8859-1') +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) + +print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) + +# Finally add the gigaword data +gigaword = json.load(open(uw_gigaword)) +gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) + +for item in gigaword: + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break + + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) + +# Now write the uniquewords to a file +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') +ltuples = sorted(merged_lexicon) + +for item in ltuples: + if not item==u'ñ' and not re.search(filtered_letters, item): + lf.write(item + "\n") + +lf.close() + +print("Finshed writing unique words") diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh new file mode 100755 index 00000000000..a95893f698a --- /dev/null +++ 
b/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +currentJob=0 + +dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats + +for f in $dir/.done.*; do + d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` + if [ $d -gt $currentJob ]; then + currentJob=$d + fi +done + +currentJob=$((currentJob+1)) + +echo Currently processing job : $currentJob + +for i in $(seq 210); do + job[$i]=$i +done + +dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats/log/$currentJob/q + +for f in $dir/done.*; do + d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` + unset job[$d] +done + +echo sub-splits left : ${#job[@]} +echo ${job[@]} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..cc9de4d26c5 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. + + +stage=7 +nj=30 +train_set=train # you might set this to e.g. train. +test_sets="test dev" +gmm=tri5a # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 7 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 7." + exit 1 +fi + + +if [ $stage -le 8 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 9 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. 
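+  # (Roughly speaking, this step only rewrites wav.scp so that each recording is
+  # piped through sox with a randomly chosen gain; the audio itself is not
+  # re-copied on disk, so it is cheap.)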
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 10 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l in the history of a n-gram +# un-comment the following line +#limit_unk_history_opt="--limit-unk-history=true" + +for order in 3; do + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + lm_name="${num_word}_${order}" + min_counts='' + # Note: the following might be a more reasonable setting: + # min_counts='fisher=2 swbd1=1' + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ + --min-counts=${min_counts} \ + --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ + ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${arpa_dir} + format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz + + # example of pruning. note: the threshold can be less than or more than one. + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + for threshold in 1.0 2.0 4.0; do + pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm + prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz + + done + + # example of pruning by size. + size=1000000 + pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm + prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' + get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz + +done + +# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) + +# the following does does some self-testing, including +# that the computed derivatives are accurate. 
+# local/self_test.sh + +# perplexities from pocolm-estimated language models with pocolm's interpolation +# method from orders 3, 4, and 5 are: +# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) +# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) +# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) + +# note, the perplexities from pocolm-estimated language models with SRILM's +# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), +# 78.8449 and 75.2202 respectively. + +# note, the perplexities from SRILM-estimated language models with SRILM's +# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), +# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py new file mode 100755 index 00000000000..5c68e1204b2 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +# Processes lattice oracles + +import os +import sys + +oracleDir = "exp/tri5a/decode_callhome_train/oracle" +wordsFile = open('exp/sgmm2x_6a/graph/words.txt') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') +oracleTmp = 'exp/tri5a/one-best/oracle-ch-train' +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/oracle.train', 'w+') +timLocation = '/export/a04/gkumar/corpora/fishcall/callhome/tim' + +def findTranscription(timeDetail): + file1 = open(oracleDir + "/oracle.tra") + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
+# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists(oracleTmp): + os.makedirs(oracleTmp) + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +for item in fileList: + timingFile = open(timLocation + '/' + item + '.es') + newFile = open(oracleTmp + '/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + + newFile.close() +provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh new file mode 100755 index 00000000000..1b54b304e50 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +. ./cmd.sh + +for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test_fmllr \ + exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it$iter & +done + + +for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_dev data/lang data/dev exp/sgmm2x_6a/decode_dev_fmllr \ + exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it$iter & +done + + +for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_dev2 data/lang data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr \ + exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it$iter & +done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh new file mode 100755 index 00000000000..aa06fdbb293 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Xiaohui Zhang + +# This script trains LMs on the swbd LM-training data. + +# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. +# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 +# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 + + +dir=Spanish_gigawrd/rnnlm +pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned +wordslist= +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=0 +train_stage=-30 +text=Spanish_gigawrd/text_lm +text_dir=Spanish_gigawrd/text_lm + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $dir/config +set -e + +for f in $text/dev.txt; do + [ ! 
-f $f ] && \ + echo "$0: expected file $f to exist;" && exit 1 +done + +if [ $stage -le 0 ]; then + if [ -f $text_dir/unigram_weights ] ; then + mv $text_dir/unigram_weights $pocolm_dir/ + fi + cp $wordslist $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt + rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ + --unk-word="" \ + --data-weights-file=$dir/config/data_weights.txt \ + $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt +fi + +if [ $stage -le 1 ]; then + cat <$dir/config/xconfig + input dim=$embedding_dim name=input + relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) + fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) + fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) + output-layer name=output include-log-softmax=false dim=$embedding_dim +EOF + rnnlm/validate_config_dir.sh $text_dir $dir/config +fi + +if [ $stage -le 2 ]; then + rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir +fi + +if [ $stage -le 3 ]; then + rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ + --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir +fi + +exit 0 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh new file mode 100755 index 00000000000..4a26f6857b8 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -euo pipefail + +punctuation_symbols=( "," "\"" "\`" "\:" "(" ")" "-" ";" "?" "!" "/" "_" "{" "}" "*" ) + +config=$1 +path_prefix=$2 +data=$3 +job=$4 +dir=$5 + +substitute_arg="" +num_syms=0 + +for i in "${punctuation_symbols[@]}"; do + symbol=${punctuation_symbols[${num_syms}]} + if [ $num_syms -eq 0 ]; then + substitute_arg="sed 's:${i}: :g'" + else + substitute_arg=$substitute_arg" | sed 's:${i}: :g'" + fi + substitute_arg=$substitute_arg" |sed 's:${i}$: :g' | sed 's:^${i}: :g'" + num_syms=$((num_syms+1)) +done +mkdir -p $dir/normalize/$job +echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh +bash $dir/normalize/$job/substitute.sh | \ + sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ + sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text +normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh new file mode 100755 index 00000000000..9148b1f1171 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +# This is as run_sgmm2.sh but excluding the "speaker-dependent weights", +# so not doing the symmetric SGMM. + +. ./cmd.sh + +## SGMM on top of LDA+MLLT+SAT features. +if [ ! -f exp/ubm6a/final.mdl ]; then + steps/train_ubm.sh --silence-weight 0.5 --cmd "$train_cmd" 800 data/train data/lang exp/tri5a_ali exp/ubm6a || exit 1; +fi +# Double the number of SAT states : sanjeev +steps/train_sgmm2.sh --spk-dep-weights false --cmd "$train_cmd" 10000 120000 \ + data/train data/lang exp/tri5a_ali exp/ubm6a/final.ubm exp/sgmm2x_6a || exit 1; + +utils/mkgraph.sh data/lang_test exp/sgmm2x_6a exp/sgmm2x_6a/graph || exit 1; + +steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev || exit 1; + +steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev_fmllr || exit 1; + +steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test || exit 1; + +steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test_fmllr || exit 1; + +steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2 || exit 1; + +steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ + --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr || exit 1; + + # Now we'll align the SGMM system to prepare for discriminative training. + steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri5a \ + --use-graphs true --use-gselect true data/train data/lang exp/sgmm2x_6a exp/sgmm2x_6a_ali || exit 1; + steps/make_denlats_sgmm2.sh --nj 30 --sub-split 210 --cmd "$decode_cmd" --transform-dir exp/tri5a \ + data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri5a --boost 0.2 \ + data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats exp/sgmm2x_6a_mmi_b0.2 + + for iter in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ + --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test exp/sgmm2x_6a_mmi_b0.2/decode_test_it$iter & + done + +wait +steps/decode_combine.sh data/test data/lang exp/tri1/decode exp/tri2a/decode exp/combine_1_2a/decode || exit 1; +steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_mmi/decode exp/combine_sgmm2x_4a_3b/decode || exit 1; +# combining the sgmm run and the best MMI+fMMI run. 
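+# (Note: the exp/sgmm2x_4a and exp/tri3b* directories referenced in the
+# decode_combine.sh calls here are not produced by this recipe; the lines look
+# like they were carried over from another egs setup.)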
+steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_3b_fmmic5/decode || exit 1; + +steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a_mmi_b0.2/decode_it4 exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_mmi_3b_fmmic5/decode || exit 1; + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh new file mode 100755 index 00000000000..21b793a4d27 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +oracle_dir=exp/tri5a/decode_callhome_test/oracle +split=callhome_test +data_dir=data/callhome_test +lang_dir=data/lang + +# Make sure that your STM and CTM files are in UTF-8 encoding +# Any other encoding will cause this script to fail/misbehave + +if [ ! -e $oracle_dir -o ! -e $data_dir -o ! -e $lang_dir ]; then + echo "Missing pre-requisites" + exit 1 +fi + +for i in {5..20}; do + mkdir -p $oracle_dir/score_$i + cp $oracle_dir/$split.ctm $oracle_dir/score_$i/ +done + +. /export/babel/data/software/env.sh + +# Start scoring +/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ + $oracle_dir + +# Print a summary of the result +grep "Percent Total Error" $oracle_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev new file mode 100644 index 00000000000..77e3b01786f --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev @@ -0,0 +1,20 @@ +sp_0897.sph +sp_0968.sph +sp_0981.sph +sp_1062.sph +sp_1292.sph +sp_1411.sph +sp_1413.sph +sp_1552.sph +sp_1554.sph +sp_1805.sph +sp_1808.sph +sp_1882.sph +sp_1930.sph +sp_1947.sph +sp_2037.sph +sp_2054.sph +sp_2057.sph +sp_2107.sph +sp_2109.sph +sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev new file mode 100644 index 00000000000..77e3b01786f --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev @@ -0,0 +1,20 @@ +sp_0897.sph +sp_0968.sph +sp_0981.sph +sp_1062.sph +sp_1292.sph +sp_1411.sph +sp_1413.sph +sp_1552.sph +sp_1554.sph +sp_1805.sph +sp_1808.sph +sp_1882.sph +sp_1930.sph +sp_1947.sph +sp_2037.sph +sp_2054.sph +sp_2057.sph +sp_2107.sph +sp_2109.sph +sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test new file mode 100644 index 00000000000..0cbc3cc95fd --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test @@ -0,0 +1,20 @@ +sp_0053.sph +sp_0082.sph +sp_0084.sph +sp_0088.sph +sp_0681.sph +sp_0699.sph +sp_0776.sph +sp_0857.sph +sp_1031.sph +sp_1100.sph +sp_1148.sph +sp_1156.sph +sp_1186.sph +sp_1212.sph +sp_1345.sph +sp_1435.sph +sp_1578.sph +sp_1648.sph +sp_1807.sph +sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train 
b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train new file mode 100644 index 00000000000..2c936072534 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train @@ -0,0 +1,80 @@ +sp_0085.sph +sp_0096.sph +sp_0098.sph +sp_0100.sph +sp_0291.sph +sp_0713.sph +sp_0724.sph +sp_0726.sph +sp_0731.sph +sp_0733.sph +sp_0753.sph +sp_0788.sph +sp_0826.sph +sp_0831.sph +sp_0836.sph +sp_0841.sph +sp_0850.sph +sp_0855.sph +sp_0892.sph +sp_0899.sph +sp_0910.sph +sp_0917.sph +sp_0919.sph +sp_0923.sph +sp_0945.sph +sp_0950.sph +sp_0951.sph +sp_0992.sph +sp_0997.sph +sp_1013.sph +sp_1039.sph +sp_1044.sph +sp_1045.sph +sp_1058.sph +sp_1060.sph +sp_1063.sph +sp_1081.sph +sp_1106.sph +sp_1122.sph +sp_1140.sph +sp_1175.sph +sp_1195.sph +sp_1198.sph +sp_1231.sph +sp_1234.sph +sp_1255.sph +sp_1260.sph +sp_1261.sph +sp_1262.sph +sp_1264.sph +sp_1266.sph +sp_1273.sph +sp_1275.sph +sp_1284.sph +sp_1286.sph +sp_1304.sph +sp_1308.sph +sp_1333.sph +sp_1341.sph +sp_1353.sph +sp_1368.sph +sp_1379.sph +sp_1384.sph +sp_1449.sph +sp_1463.sph +sp_1574.sph +sp_1740.sph +sp_1759.sph +sp_1849.sph +sp_1908.sph +sp_1915.sph +sp_1918.sph +sp_1974.sph +sp_1976.sph +sp_1988.sph +sp_2000.sph +sp_2056.sph +sp_2070.sph +sp_2091.sph +sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev new file mode 100644 index 00000000000..d3769f0ffb5 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev @@ -0,0 +1,20 @@ +20051009_182032_217_fsp.sph +20051009_210519_219_fsp.sph +20051010_212418_225_fsp.sph +20051016_180547_265_fsp.sph +20051016_210626_267_fsp.sph +20051017_180712_270_fsp.sph +20051017_220530_275_fsp.sph +20051017_234550_276_fsp.sph +20051018_210220_279_fsp.sph +20051018_210744_280_fsp.sph +20051019_190221_288_fsp.sph +20051019_210146_289_fsp.sph +20051019_230329_292_fsp.sph +20051022_180817_311_fsp.sph +20051023_232057_325_fsp.sph +20051024_180453_327_fsp.sph +20051024_181110_329_fsp.sph +20051025_212334_337_fsp.sph +20051026_180724_341_fsp.sph +20051026_211309_346_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 new file mode 100644 index 00000000000..f1b5c293d67 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 @@ -0,0 +1,20 @@ +20050909_210655_26_fsp.sph +20050910_210708_33_fsp.sph +20050913_210933_49_fsp.sph +20050913_211649_50_fsp.sph +20050915_210434_65_fsp.sph +20050916_180332_68_fsp.sph +20050918_180733_81_fsp.sph +20050918_210841_82_fsp.sph +20050920_212030_93_fsp.sph +20050921_210443_99_fsp.sph +20050923_211304_115_fsp.sph +20050925_180713_120_fsp.sph +20050925_180825_121_fsp.sph +20050926_180516_125_fsp.sph +20050926_180555_126_fsp.sph +20050928_000254_141_fsp.sph +20050930_210540_161_fsp.sph +20051002_180726_170_fsp.sph +20051007_181850_205_fsp.sph +20051007_191217_206_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test new file mode 100644 index 00000000000..6190ced077c --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test @@ -0,0 +1,20 @@ +20051028_180633_356_fsp.sph +20051029_211606_365_fsp.sph +20051030_193924_371_fsp.sph +20051101_212731_386_fsp.sph +20051102_134901_389_fsp.sph +20051102_180402_391_fsp.sph 
+20051102_181501_393_fsp.sph +20051103_211105_404_fsp.sph +20051103_233456_406_fsp.sph +20051107_184634_438_fsp.sph +20051109_180253_445_fsp.sph +20051109_210353_450_fsp.sph +20051111_181045_470_fsp.sph +20051111_182216_472_fsp.sph +20051112_181649_485_fsp.sph +20051113_155059_492_fsp.sph +20051113_210221_496_fsp.sph +20051113_214925_498_fsp.sph +20051114_181749_505_fsp.sph +20051115_212123_516_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train new file mode 100644 index 00000000000..b57683842b2 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train @@ -0,0 +1,759 @@ +20050908_182943_22_fsp.sph +20050908_191808_23_fsp.sph +20050909_210428_25_fsp.sph +20050909_221657_28_fsp.sph +20050910_180310_29_fsp.sph +20050910_180330_30_fsp.sph +20050910_181354_31_fsp.sph +20050910_190223_32_fsp.sph +20050911_180647_34_fsp.sph +20050911_200216_35_fsp.sph +20050911_210429_36_fsp.sph +20050911_210530_37_fsp.sph +20050911_210904_38_fsp.sph +20050912_181441_40_fsp.sph +20050912_181538_41_fsp.sph +20050912_182044_42_fsp.sph +20050912_212913_43_fsp.sph +20050913_180324_44_fsp.sph +20050913_180731_46_fsp.sph +20050913_180947_47_fsp.sph +20050913_210409_48_fsp.sph +20050914_000831_51_fsp.sph +20050914_180332_52_fsp.sph +20050914_180606_53_fsp.sph +20050914_181020_54_fsp.sph +20050914_210243_55_fsp.sph +20050914_210822_56_fsp.sph +20050914_220753_58_fsp.sph +20050915_180728_60_fsp.sph +20050915_180740_61_fsp.sph +20050915_192457_62_fsp.sph +20050915_194045_63_fsp.sph +20050915_210200_64_fsp.sph +20050915_210916_66_fsp.sph +20050915_212325_67_fsp.sph +20050916_180740_69_fsp.sph +20050916_200334_70_fsp.sph +20050916_210235_71_fsp.sph +20050916_210510_72_fsp.sph +20050916_223656_73_fsp.sph +20050917_210406_74_fsp.sph +20050917_210805_75_fsp.sph +20050917_211045_76_fsp.sph +20050917_212041_77_fsp.sph +20050918_180326_80_fsp.sph +20050919_000612_83_fsp.sph +20050919_180511_84_fsp.sph +20050919_180703_85_fsp.sph +20050919_180925_86_fsp.sph +20050919_190254_87_fsp.sph +20050920_180330_88_fsp.sph +20050920_180342_89_fsp.sph +20050920_180607_90_fsp.sph +20050920_181919_91_fsp.sph +20050920_211414_92_fsp.sph +20050920_230520_94_fsp.sph +20050921_180639_95_fsp.sph +20050921_181002_96_fsp.sph +20050921_210340_98_fsp.sph +20050921_211329_101_fsp.sph +20050921_221625_102_fsp.sph +20050922_180618_103_fsp.sph +20050922_180948_104_fsp.sph +20050922_210740_106_fsp.sph +20050922_211003_107_fsp.sph +20050922_230412_108_fsp.sph +20050923_180514_110_fsp.sph +20050923_180530_111_fsp.sph +20050923_210442_114_fsp.sph +20050924_180747_117_fsp.sph +20050924_181124_118_fsp.sph +20050925_210645_122_fsp.sph +20050925_231407_123_fsp.sph +20050926_000425_124_fsp.sph +20050926_180719_127_fsp.sph +20050926_220244_130_fsp.sph +20050926_230706_131_fsp.sph +20050927_180422_132_fsp.sph +20050927_181033_133_fsp.sph +20050927_181232_134_fsp.sph +20050927_210320_135_fsp.sph +20050927_210848_136_fsp.sph +20050927_210947_138_fsp.sph +20050927_211929_139_fsp.sph +20050927_231016_140_fsp.sph +20050928_180631_142_fsp.sph +20050928_210256_144_fsp.sph +20050928_210700_145_fsp.sph +20050928_211113_146_fsp.sph +20050928_220320_147_fsp.sph +20050928_232236_148_fsp.sph +20050929_180318_149_fsp.sph +20050929_180722_150_fsp.sph +20050929_180932_151_fsp.sph +20050929_211337_153_fsp.sph +20050929_220820_154_fsp.sph +20050929_230406_155_fsp.sph +20050930_180329_156_fsp.sph +20050930_180411_157_fsp.sph 
+20050930_180646_158_fsp.sph +20050930_200308_159_fsp.sph +20051001_180328_163_fsp.sph +20051001_181004_164_fsp.sph +20051001_210749_166_fsp.sph +20051001_211346_167_fsp.sph +20051002_180339_169_fsp.sph +20051002_210324_171_fsp.sph +20051002_220651_174_fsp.sph +20051003_180434_175_fsp.sph +20051003_211042_178_fsp.sph +20051003_220633_179_fsp.sph +20051004_180351_180_fsp.sph +20051004_180542_181_fsp.sph +20051004_180730_182_fsp.sph +20051004_200737_183_fsp.sph +20051004_211611_185_fsp.sph +20051005_180420_187_fsp.sph +20051005_180709_188_fsp.sph +20051005_213606_191_fsp.sph +20051005_220917_192_fsp.sph +20051005_230659_193_fsp.sph +20051006_180416_194_fsp.sph +20051006_180653_195_fsp.sph +20051006_180815_196_fsp.sph +20051006_181525_197_fsp.sph +20051006_183153_199_fsp.sph +20051006_210246_200_fsp.sph +20051006_210417_201_fsp.sph +20051006_220329_203_fsp.sph +20051008_000036_208_fsp.sph +20051008_180249_209_fsp.sph +20051008_181720_210_fsp.sph +20051008_183224_211_fsp.sph +20051008_190256_212_fsp.sph +20051008_211712_214_fsp.sph +20051008_213416_215_fsp.sph +20051009_180444_216_fsp.sph +20051009_190753_218_fsp.sph +20051009_220443_221_fsp.sph +20051010_180650_222_fsp.sph +20051010_182706_223_fsp.sph +20051010_210622_224_fsp.sph +20051010_222853_227_fsp.sph +20051010_231630_228_fsp.sph +20051011_181919_230_fsp.sph +20051011_211026_232_fsp.sph +20051011_220348_233_fsp.sph +20051012_180233_234_fsp.sph +20051012_190241_236_fsp.sph +20051012_193952_237_fsp.sph +20051012_224157_239_fsp.sph +20051013_180458_240_fsp.sph +20051013_180613_241_fsp.sph +20051013_180700_242_fsp.sph +20051013_182213_244_fsp.sph +20051013_210221_245_fsp.sph +20051013_210425_246_fsp.sph +20051013_210941_247_fsp.sph +20051013_220243_248_fsp.sph +20051014_180259_249_fsp.sph +20051014_180940_250_fsp.sph +20051014_180948_251_fsp.sph +20051014_183707_252_fsp.sph +20051014_210348_253_fsp.sph +20051014_210647_254_fsp.sph +20051014_220227_256_fsp.sph +20051014_230339_257_fsp.sph +20051015_180549_258_fsp.sph +20051015_190247_259_fsp.sph +20051015_210138_260_fsp.sph +20051015_210701_261_fsp.sph +20051015_210831_262_fsp.sph +20051016_180926_266_fsp.sph +20051017_000346_269_fsp.sph +20051017_210137_273_fsp.sph +20051017_215732_274_fsp.sph +20051018_180559_277_fsp.sph +20051018_180816_278_fsp.sph +20051018_211701_282_fsp.sph +20051018_231046_283_fsp.sph +20051018_235317_284_fsp.sph +20051019_180448_285_fsp.sph +20051019_183344_287_fsp.sph +20051020_180339_293_fsp.sph +20051020_180759_295_fsp.sph +20051020_210218_297_fsp.sph +20051020_212525_299_fsp.sph +20051020_222944_300_fsp.sph +20051020_234953_301_fsp.sph +20051021_180218_302_fsp.sph +20051021_180508_303_fsp.sph +20051021_190605_304_fsp.sph +20051021_210159_305_fsp.sph +20051021_210530_306_fsp.sph +20051021_222225_307_fsp.sph +20051022_001311_309_fsp.sph +20051022_180452_310_fsp.sph +20051022_180829_312_fsp.sph +20051022_190406_313_fsp.sph +20051022_200517_314_fsp.sph +20051022_210920_315_fsp.sph +20051022_230324_316_fsp.sph +20051022_232428_317_fsp.sph +20051023_180342_318_fsp.sph +20051023_180530_319_fsp.sph +20051023_190301_321_fsp.sph +20051023_210258_322_fsp.sph +20051023_210605_323_fsp.sph +20051023_223751_324_fsp.sph +20051024_000348_326_fsp.sph +20051024_180624_328_fsp.sph +20051024_210748_330_fsp.sph +20051024_211346_331_fsp.sph +20051024_221753_332_fsp.sph +20051024_230857_333_fsp.sph +20051025_180351_334_fsp.sph +20051025_210532_335_fsp.sph +20051025_210959_336_fsp.sph +20051025_220419_338_fsp.sph +20051026_180611_340_fsp.sph +20051026_190359_343_fsp.sph 
+20051026_210334_344_fsp.sph +20051026_211202_345_fsp.sph +20051026_230956_347_fsp.sph +20051026_234001_348_fsp.sph +20051027_180217_349_fsp.sph +20051027_210159_351_fsp.sph +20051027_210333_352_fsp.sph +20051027_211525_353_fsp.sph +20051027_231329_354_fsp.sph +20051028_180329_355_fsp.sph +20051028_210350_358_fsp.sph +20051028_211904_359_fsp.sph +20051029_200218_363_fsp.sph +20051029_210442_364_fsp.sph +20051029_220538_366_fsp.sph +20051030_000333_367_fsp.sph +20051030_180521_368_fsp.sph +20051030_181001_369_fsp.sph +20051030_190231_370_fsp.sph +20051030_210903_372_fsp.sph +20051030_230444_373_fsp.sph +20051031_180213_374_fsp.sph +20051031_180906_375_fsp.sph +20051031_210229_377_fsp.sph +20051031_220447_379_fsp.sph +20051101_153940_380_fsp.sph +20051101_211314_384_fsp.sph +20051101_223911_387_fsp.sph +20051101_230216_388_fsp.sph +20051102_175957_390_fsp.sph +20051102_210243_394_fsp.sph +20051102_210828_395_fsp.sph +20051102_211130_396_fsp.sph +20051103_163507_398_fsp.sph +20051103_180920_400_fsp.sph +20051103_185102_401_fsp.sph +20051103_210539_403_fsp.sph +20051103_223906_405_fsp.sph +20051104_123901_407_fsp.sph +20051104_180145_408_fsp.sph +20051104_181437_409_fsp.sph +20051104_190247_410_fsp.sph +20051104_210307_411_fsp.sph +20051104_210814_412_fsp.sph +20051104_212121_413_fsp.sph +20051104_222117_414_fsp.sph +20051104_231424_416_fsp.sph +20051105_175657_418_fsp.sph +20051105_181203_419_fsp.sph +20051105_210724_421_fsp.sph +20051105_220745_422_fsp.sph +20051106_180232_424_fsp.sph +20051106_181321_425_fsp.sph +20051106_190219_426_fsp.sph +20051106_200213_427_fsp.sph +20051106_210215_428_fsp.sph +20051106_210310_429_fsp.sph +20051106_211252_430_fsp.sph +20051106_211804_431_fsp.sph +20051106_215339_432_fsp.sph +20051106_221653_433_fsp.sph +20051107_115855_434_fsp.sph +20051107_160351_435_fsp.sph +20051107_180332_436_fsp.sph +20051107_182401_437_fsp.sph +20051107_210309_439_fsp.sph +20051107_212723_440_fsp.sph +20051108_145902_441_fsp.sph +20051108_181424_442_fsp.sph +20051108_210224_443_fsp.sph +20051108_212018_444_fsp.sph +20051109_180413_446_fsp.sph +20051109_181432_447_fsp.sph +20051109_181906_448_fsp.sph +20051109_183631_449_fsp.sph +20051109_210436_451_fsp.sph +20051109_211151_452_fsp.sph +20051109_212148_453_fsp.sph +20051109_232505_454_fsp.sph +20051110_155523_455_fsp.sph +20051110_180208_456_fsp.sph +20051110_180838_457_fsp.sph +20051110_182221_459_fsp.sph +20051110_182318_460_fsp.sph +20051110_210200_461_fsp.sph +20051110_210233_462_fsp.sph +20051110_210454_463_fsp.sph +20051110_211110_464_fsp.sph +20051110_212818_466_fsp.sph +20051110_225245_467_fsp.sph +20051111_181441_471_fsp.sph +20051111_184451_474_fsp.sph +20051111_190326_475_fsp.sph +20051111_194004_477_fsp.sph +20051111_201357_478_fsp.sph +20051111_230329_480_fsp.sph +20051112_000305_482_fsp.sph +20051112_165916_483_fsp.sph +20051112_185651_487_fsp.sph +20051112_190443_488_fsp.sph +20051112_210205_489_fsp.sph +20051112_210631_490_fsp.sph +20051112_231502_491_fsp.sph +20051113_180809_493_fsp.sph +20051113_210908_497_fsp.sph +20051113_220433_499_fsp.sph +20051114_171942_502_fsp.sph +20051114_181118_504_fsp.sph +20051114_210412_506_fsp.sph +20051114_212032_507_fsp.sph +20051114_215057_508_fsp.sph +20051114_220412_509_fsp.sph +20051114_225557_510_fsp.sph +20051115_134012_511_fsp.sph +20051115_180301_512_fsp.sph +20051115_181412_513_fsp.sph +20051115_181731_514_fsp.sph +20051115_182149_515_fsp.sph +20051115_213551_517_fsp.sph +20051115_215935_518_fsp.sph +20051115_230749_520_fsp.sph +20051116_000221_521_fsp.sph 
+20051116_172353_522_fsp.sph +20051116_180237_524_fsp.sph +20051116_181228_525_fsp.sph +20051116_181816_526_fsp.sph +20051116_190450_527_fsp.sph +20051116_210146_528_fsp.sph +20051116_210553_529_fsp.sph +20051116_211222_530_fsp.sph +20051116_212312_531_fsp.sph +20051116_222454_532_fsp.sph +20051116_233038_533_fsp.sph +20051117_001013_534_fsp.sph +20051117_180234_535_fsp.sph +20051117_181844_537_fsp.sph +20051117_210156_538_fsp.sph +20051117_210403_539_fsp.sph +20051117_211540_540_fsp.sph +20051117_211833_541_fsp.sph +20051117_212855_542_fsp.sph +20051117_213407_543_fsp.sph +20051117_220412_544_fsp.sph +20051117_225943_545_fsp.sph +20051118_180619_547_fsp.sph +20051118_180739_548_fsp.sph +20051118_182114_549_fsp.sph +20051118_182652_550_fsp.sph +20051118_210212_551_fsp.sph +20051118_210455_552_fsp.sph +20051118_212058_553_fsp.sph +20051118_212829_554_fsp.sph +20051119_000355_555_fsp.sph +20051119_181105_556_fsp.sph +20051119_210802_557_fsp.sph +20051119_212315_559_fsp.sph +20051119_214926_560_fsp.sph +20051120_181008_561_fsp.sph +20051120_181339_562_fsp.sph +20051120_190412_563_fsp.sph +20051120_205645_565_fsp.sph +20051120_210347_566_fsp.sph +20051120_211526_567_fsp.sph +20051121_181138_569_fsp.sph +20051121_181357_570_fsp.sph +20051121_190155_571_fsp.sph +20051121_210922_573_fsp.sph +20051122_181114_574_fsp.sph +20051122_190326_576_fsp.sph +20051122_210253_577_fsp.sph +20051122_210703_578_fsp.sph +20051122_211805_579_fsp.sph +20051122_213037_580_fsp.sph +20051122_215430_581_fsp.sph +20051123_180926_582_fsp.sph +20051123_181644_583_fsp.sph +20051123_210214_584_fsp.sph +20051123_211514_585_fsp.sph +20051123_212412_586_fsp.sph +20051123_213259_587_fsp.sph +20051124_181720_588_fsp.sph +20051124_190336_589_fsp.sph +20051124_212221_591_fsp.sph +20051124_220457_592_fsp.sph +20051125_181632_593_fsp.sph +20051125_190327_594_fsp.sph +20051125_212150_595_fsp.sph +20051126_181804_597_fsp.sph +20051126_190347_598_fsp.sph +20051126_210222_599_fsp.sph +20051127_181335_601_fsp.sph +20051127_190405_602_fsp.sph +20051127_210516_603_fsp.sph +20051127_211200_604_fsp.sph +20051127_212516_605_fsp.sph +20051128_215149_608_fsp.sph +20051128_222007_609_fsp.sph +20051129_180204_610_fsp.sph +20051129_181241_612_fsp.sph +20051129_181547_613_fsp.sph +20051129_183449_614_fsp.sph +20051129_190152_615_fsp.sph +20051129_210218_616_fsp.sph +20051129_210342_617_fsp.sph +20051129_212711_618_fsp.sph +20051130_181543_619_fsp.sph +20051130_182626_620_fsp.sph +20051130_210202_622_fsp.sph +20051130_210910_623_fsp.sph +20051130_212724_626_fsp.sph +20051130_220121_627_fsp.sph +20051130_221538_628_fsp.sph +20051201_181034_630_fsp.sph +20051201_181303_631_fsp.sph +20051201_183429_632_fsp.sph +20051201_191426_633_fsp.sph +20051201_193415_634_fsp.sph +20051201_195005_635_fsp.sph +20051201_210713_636_fsp.sph +20051201_212329_637_fsp.sph +20051201_230640_638_fsp.sph +20051202_181119_639_fsp.sph +20051202_181659_640_fsp.sph +20051202_182058_641_fsp.sph +20051202_184713_642_fsp.sph +20051202_190154_643_fsp.sph +20051202_193515_644_fsp.sph +20051202_210252_645_fsp.sph +20051202_211824_646_fsp.sph +20051202_212105_647_fsp.sph +20051203_180701_649_fsp.sph +20051203_182100_650_fsp.sph +20051203_182132_651_fsp.sph +20051203_182418_652_fsp.sph +20051203_183501_653_fsp.sph +20051203_190503_654_fsp.sph +20051203_191125_655_fsp.sph +20051203_210216_656_fsp.sph +20051203_212114_658_fsp.sph +20051203_222533_661_fsp.sph +20051206_180753_662_fsp.sph +20051206_180911_663_fsp.sph +20051206_181649_664_fsp.sph +20051206_183057_665_fsp.sph 
+20051206_193937_667_fsp.sph +20051206_201757_668_fsp.sph +20051206_203158_669_fsp.sph +20051206_210127_670_fsp.sph +20051206_210744_671_fsp.sph +20051206_211522_672_fsp.sph +20051206_213252_673_fsp.sph +20051206_214122_674_fsp.sph +20051206_231328_675_fsp.sph +20051207_180507_676_fsp.sph +20051207_181020_677_fsp.sph +20051207_190155_678_fsp.sph +20051207_190426_679_fsp.sph +20051207_193103_681_fsp.sph +20051207_211858_683_fsp.sph +20051207_212300_684_fsp.sph +20051207_212831_685_fsp.sph +20051207_214411_686_fsp.sph +20051208_180208_687_fsp.sph +20051208_180810_688_fsp.sph +20051208_182430_689_fsp.sph +20051208_190333_690_fsp.sph +20051208_210609_691_fsp.sph +20051208_211702_692_fsp.sph +20051208_212444_694_fsp.sph +20051208_214100_696_fsp.sph +20051208_220606_697_fsp.sph +20051209_180824_699_fsp.sph +20051209_181542_700_fsp.sph +20051209_181642_701_fsp.sph +20051209_182541_702_fsp.sph +20051209_182858_703_fsp.sph +20051209_210136_704_fsp.sph +20051209_210452_705_fsp.sph +20051209_211542_706_fsp.sph +20051209_212515_707_fsp.sph +20051209_222427_709_fsp.sph +20051209_231702_710_fsp.sph +20051210_180659_711_fsp.sph +20051210_181201_712_fsp.sph +20051210_182013_713_fsp.sph +20051210_182603_714_fsp.sph +20051210_190201_715_fsp.sph +20051210_210535_717_fsp.sph +20051210_210735_718_fsp.sph +20051211_000414_719_fsp.sph +20051211_181346_720_fsp.sph +20051211_182045_721_fsp.sph +20051211_184252_723_fsp.sph +20051211_190523_724_fsp.sph +20051211_210240_725_fsp.sph +20051211_211415_726_fsp.sph +20051212_180251_727_fsp.sph +20051212_181817_728_fsp.sph +20051212_182453_729_fsp.sph +20051212_190335_730_fsp.sph +20051212_210527_731_fsp.sph +20051212_210738_732_fsp.sph +20051212_211419_733_fsp.sph +20051212_213447_734_fsp.sph +20051212_214512_735_fsp.sph +20051213_180254_736_fsp.sph +20051213_185913_737_fsp.sph +20051213_191741_738_fsp.sph +20051213_210120_739_fsp.sph +20051213_211552_741_fsp.sph +20051213_211953_742_fsp.sph +20051213_221424_743_fsp.sph +20051213_222016_744_fsp.sph +20051214_193942_746_fsp.sph +20051214_194606_747_fsp.sph +20051214_201000_748_fsp.sph +20051214_202717_749_fsp.sph +20051214_211653_750_fsp.sph +20051214_212318_751_fsp.sph +20051214_212718_752_fsp.sph +20051214_213225_753_fsp.sph +20051215_180855_754_fsp.sph +20051215_181731_755_fsp.sph +20051215_182213_756_fsp.sph +20051215_190143_757_fsp.sph +20051215_190419_758_fsp.sph +20051215_195526_759_fsp.sph +20051215_200925_760_fsp.sph +20051215_201639_761_fsp.sph +20051215_203848_762_fsp.sph +20051215_210410_764_fsp.sph +20051215_212456_766_fsp.sph +20051215_212701_767_fsp.sph +20051215_212749_768_fsp.sph +20051215_214814_769_fsp.sph +20051215_220537_770_fsp.sph +20051215_222306_771_fsp.sph +20051216_181042_773_fsp.sph +20051216_182340_774_fsp.sph +20051216_191101_775_fsp.sph +20051216_192823_776_fsp.sph +20051216_200153_777_fsp.sph +20051216_211423_778_fsp.sph +20051216_220626_779_fsp.sph +20051217_142547_780_fsp.sph +20051217_180231_781_fsp.sph +20051217_182026_783_fsp.sph +20051217_182330_784_fsp.sph +20051217_182530_785_fsp.sph +20051217_183115_786_fsp.sph +20051217_190226_787_fsp.sph +20051218_142845_790_fsp.sph +20051218_180353_791_fsp.sph +20051218_181751_792_fsp.sph +20051218_182127_793_fsp.sph +20051218_182750_794_fsp.sph +20051218_200401_799_fsp.sph +20051218_210249_800_fsp.sph +20051218_211820_801_fsp.sph +20051218_212444_802_fsp.sph +20051218_212813_803_fsp.sph +20051219_180225_804_fsp.sph +20051219_182110_806_fsp.sph +20051219_190625_808_fsp.sph +20051219_210655_812_fsp.sph +20051219_212218_813_fsp.sph 
+20051219_212716_814_fsp.sph +20051219_213203_815_fsp.sph +20051219_221213_816_fsp.sph +20051219_223123_817_fsp.sph +20051220_181731_820_fsp.sph +20051220_190121_821_fsp.sph +20051220_212400_826_fsp.sph +20051220_212718_828_fsp.sph +20051220_213420_829_fsp.sph +20051221_000417_830_fsp.sph +20051221_180958_831_fsp.sph +20051221_210452_840_fsp.sph +20051221_212325_841_fsp.sph +20051221_212911_842_fsp.sph +20051222_000436_843_fsp.sph +20051222_181242_845_fsp.sph +20051222_181506_846_fsp.sph +20051222_182617_847_fsp.sph +20051222_184209_849_fsp.sph +20051222_200553_850_fsp.sph +20051222_210309_852_fsp.sph +20051222_212425_855_fsp.sph +20051223_180346_856_fsp.sph +20051223_181050_857_fsp.sph +20051223_183105_860_fsp.sph +20051223_212547_863_fsp.sph +20051223_212853_864_fsp.sph +20051224_180302_865_fsp.sph +20051224_182949_867_fsp.sph +20051224_210150_870_fsp.sph +20051224_213010_871_fsp.sph +20051225_192042_872_fsp.sph +20051225_210556_873_fsp.sph +20051226_180908_874_fsp.sph +20051226_181659_875_fsp.sph +20051227_181058_885_fsp.sph +20051227_211308_887_fsp.sph +20051227_213029_888_fsp.sph +20051227_214843_889_fsp.sph +20051227_220309_890_fsp.sph +20051228_180249_891_fsp.sph +20051228_182051_892_fsp.sph +20051228_183955_893_fsp.sph +20051228_210524_896_fsp.sph +20051228_211808_897_fsp.sph +20051228_212304_899_fsp.sph +20051228_212734_900_fsp.sph +20051228_223227_901_fsp.sph +20051229_180231_902_fsp.sph +20051229_182614_906_fsp.sph +20051229_182631_907_fsp.sph +20051229_214024_909_fsp.sph +20051230_180457_910_fsp.sph +20051230_181721_912_fsp.sph +20051230_210412_913_fsp.sph +20051230_210559_914_fsp.sph +20051230_212557_915_fsp.sph +20051231_000808_916_fsp.sph +20060103_180314_917_fsp.sph +20060103_182107_918_fsp.sph +20060103_182257_919_fsp.sph +20060103_182549_920_fsp.sph +20060103_182654_921_fsp.sph +20060103_184037_922_fsp.sph +20060103_211504_925_fsp.sph +20060103_211732_926_fsp.sph +20060104_180509_928_fsp.sph +20060104_181040_929_fsp.sph +20060104_182115_930_fsp.sph +20060104_182644_931_fsp.sph +20060104_190448_933_fsp.sph +20060104_192707_934_fsp.sph +20060104_210223_935_fsp.sph +20060104_212844_936_fsp.sph +20060104_220148_937_fsp.sph +20060105_202127_943_fsp.sph +20060105_205957_944_fsp.sph +20060105_210951_945_fsp.sph +20060105_211743_946_fsp.sph +20060105_213129_947_fsp.sph +20060105_213243_948_fsp.sph +20060105_230711_949_fsp.sph +20060106_180202_950_fsp.sph +20060106_181040_951_fsp.sph +20060106_181726_952_fsp.sph +20060106_182909_953_fsp.sph +20060106_183056_954_fsp.sph +20060106_183550_955_fsp.sph +20060106_185224_956_fsp.sph +20060106_193129_957_fsp.sph +20060107_180634_960_fsp.sph +20060107_181553_961_fsp.sph +20060107_182715_962_fsp.sph +20060107_190206_963_fsp.sph +20060107_190415_964_fsp.sph +20060107_210435_966_fsp.sph +20060107_220739_967_fsp.sph +20060108_180630_968_fsp.sph +20060108_194731_971_fsp.sph +20060108_234917_976_fsp.sph +20060109_180448_977_fsp.sph +20060109_182557_979_fsp.sph +20060109_183636_980_fsp.sph +20060109_183727_981_fsp.sph +20060109_205815_982_fsp.sph +20060109_213409_986_fsp.sph +20060109_215138_987_fsp.sph +20060109_220315_988_fsp.sph +20060109_220535_989_fsp.sph +20060110_183405_995_fsp.sph +20060110_200611_998_fsp.sph +20060110_210730_1002_fsp.sph +20060110_213516_1004_fsp.sph +20060110_221920_1006_fsp.sph +20060110_230947_1007_fsp.sph +20060111_181650_1008_fsp.sph +20060111_182557_1009_fsp.sph +20060111_184916_1010_fsp.sph +20060111_192159_1012_fsp.sph +20060111_200345_1013_fsp.sph +20060111_210257_1014_fsp.sph +20060111_212145_1016_fsp.sph 
+20060111_213742_1017_fsp.sph +20060111_213936_1018_fsp.sph +20060111_230912_1020_fsp.sph +20060112_180639_1021_fsp.sph +20060112_182612_1022_fsp.sph +20060112_183346_1023_fsp.sph +20060112_183622_1024_fsp.sph +20060112_210747_1025_fsp.sph +20060112_211025_1026_fsp.sph +20060112_221010_1027_fsp.sph +20060112_221022_1028_fsp.sph +20060113_180159_1030_fsp.sph +20060113_183452_1033_fsp.sph +20060113_190403_1034_fsp.sph +20060113_213733_1036_fsp.sph +20060114_181137_1039_fsp.sph +20060114_181922_1040_fsp.sph +20060114_191056_1043_fsp.sph +20060114_213242_1044_fsp.sph +20060115_180421_1045_fsp.sph +20060115_183525_1047_fsp.sph +20060115_210217_1048_fsp.sph +20060115_212231_1051_fsp.sph +20060115_220504_1052_fsp.sph +20060115_232345_1053_fsp.sph +20060116_181908_1054_fsp.sph +20060116_182500_1055_fsp.sph +20060116_183201_1056_fsp.sph +20060116_184141_1057_fsp.sph +20060116_202324_1058_fsp.sph +20060116_204753_1059_fsp.sph +20060116_210217_1060_fsp.sph +20060116_211237_1061_fsp.sph +20060116_212845_1063_fsp.sph +20060116_220652_1064_fsp.sph +20060116_221118_1065_fsp.sph +20060117_181936_1068_fsp.sph +20060117_182604_1069_fsp.sph +20060117_185153_1071_fsp.sph +20060117_210138_1072_fsp.sph +20060117_210311_1073_fsp.sph +20060117_212546_1074_fsp.sph +20060118_180229_1076_fsp.sph +20060118_180647_1078_fsp.sph +20060118_182448_1079_fsp.sph +20060118_183010_1080_fsp.sph +20060118_190231_1082_fsp.sph +20060118_200148_1083_fsp.sph +20060118_205216_1084_fsp.sph +20060118_212907_1085_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test new file mode 100644 index 00000000000..0cbc3cc95fd --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test @@ -0,0 +1,20 @@ +sp_0053.sph +sp_0082.sph +sp_0084.sph +sp_0088.sph +sp_0681.sph +sp_0699.sph +sp_0776.sph +sp_0857.sph +sp_1031.sph +sp_1100.sph +sp_1148.sph +sp_1156.sph +sp_1186.sph +sp_1212.sph +sp_1345.sph +sp_1435.sph +sp_1578.sph +sp_1648.sph +sp_1807.sph +sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train new file mode 100644 index 00000000000..2c936072534 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train @@ -0,0 +1,80 @@ +sp_0085.sph +sp_0096.sph +sp_0098.sph +sp_0100.sph +sp_0291.sph +sp_0713.sph +sp_0724.sph +sp_0726.sph +sp_0731.sph +sp_0733.sph +sp_0753.sph +sp_0788.sph +sp_0826.sph +sp_0831.sph +sp_0836.sph +sp_0841.sph +sp_0850.sph +sp_0855.sph +sp_0892.sph +sp_0899.sph +sp_0910.sph +sp_0917.sph +sp_0919.sph +sp_0923.sph +sp_0945.sph +sp_0950.sph +sp_0951.sph +sp_0992.sph +sp_0997.sph +sp_1013.sph +sp_1039.sph +sp_1044.sph +sp_1045.sph +sp_1058.sph +sp_1060.sph +sp_1063.sph +sp_1081.sph +sp_1106.sph +sp_1122.sph +sp_1140.sph +sp_1175.sph +sp_1195.sph +sp_1198.sph +sp_1231.sph +sp_1234.sph +sp_1255.sph +sp_1260.sph +sp_1261.sph +sp_1262.sph +sp_1264.sph +sp_1266.sph +sp_1273.sph +sp_1275.sph +sp_1284.sph +sp_1286.sph +sp_1304.sph +sp_1308.sph +sp_1333.sph +sp_1341.sph +sp_1353.sph +sp_1368.sph +sp_1379.sph +sp_1384.sph +sp_1449.sph +sp_1463.sph +sp_1574.sph +sp_1740.sph +sp_1759.sph +sp_1849.sph +sp_1908.sph +sp_1915.sph +sp_1918.sph +sp_1974.sph +sp_1976.sph +sp_1988.sph +sp_2000.sph +sp_2056.sph +sp_2070.sph +sp_2091.sph +sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl new file mode 100755 index 00000000000..03193384670 --- /dev/null +++ 
b/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl @@ -0,0 +1,304 @@ +#!/usr/bin/env perl + +# Oct 21, 2015 : Gaurav Kumar (Johns Hopkins University) +# GNU General Public License, v3.0 +# +# This script was modified under GPL and is being distributed with +# Kaldi. It requires the preference and rule files +# (under LDC copyright) from LDC96L16. The main changes were +# - Outdated usage of perl conventions updated @_ => $_ or @A +# - This script no longer needs the preference and rule files to +# be in the same directory as this script. +# - Accepts tokens from instead of <> + +# --- Retained previous version information ---------------------------- +# spron.pl Version 0.1 Jan. 11 1995 +# Written by Zhibiao Wu, LDC, wzb@unagi.cis.upenn.edu +# This program needs the basic_rules file to run. The rules must be sorted +# in alphabetical order. The most specific rules should precede the more +# general ones. The conventions used in the basic rules are the same as +# regular expressions used in Perl. + +# Revised history: Feb. 10 1995 + +# The file "preferences" (assumed to be in your current directory) +# gives an "oracle" of correct pronunciations that override the +# machine-generated ones. + +# slightly changed 97/09/05 robertm: +# - look for basic_rules and preferences in $PWD instead of ~wzb/... +# - use next to shortcut loop instead of if/else +# - added a bit of documentation, without really trying to decipher this thing +# ----------------------------------------------------------------------- + +use utf8; +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +$vfile = ""; +$preference_file = ""; +$rules_file = ""; +$print_input = 0; +if ($#ARGV < 1) { + # Print Usage + print "Usage : local/spron.pl pref-file rules-file \n"; + exit 1; +} else { + $preference_file = $ARGV[0]; + $rules_file = $ARGV[1]; + if ($#ARGV > 1) { + $vfile = $ARGV[2]; + } + if ($#ARGV > 2) { + $print_input = 1; + } +} + +$rule_num = 0; +$previous = ""; +if ($vfile ne "") { + open(VF, $vfile) || die "Can't find file $vfile!\n"; + while () { + chop; + @A = split(//); + if (($A[0] ne '#') && ($_ ne "")) { + if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { + $head[$rule_num] = $1; + $end[$rule_num] = $2; + $pre[$rule_num] = $3; + if ($4 =~ /#/) { + $nex[$rule_num] = ""; + $some[$rule_num] = $4; + } else { + $nex[$rule_num] = $4; + $some[$rule_num] = $5; + } + if ($previous ne substr($head[$rule_num],0,1)) { + $first{$head[$rule_num]} = $rule_num; + $last{$previous} = $rule_num - 1; + } + $previous = substr($head[$rule_num++],0,1); + } else { + print "Rule format error: Cannot parse $_\n"; + exit(1); + } + } + } + $last{$previous} = $rule_num - 1; + + close(VF); +} + +open(PF, $preference_file) || die "Can't read `preferences' file"; +binmode(PF, ":iso88591"); +while () { + chop; + if ($_ ne "") { + @A = split; + $pron{$A[0]} = $A[1]; + $stre{$A[0]} = $A[2]; + } +} + +$previous = ""; +$brule_num = 0; +open(BF, $rules_file) || die "Can't read `basic_rules' file"; +binmode(BF, ":iso88591"); +while () { + chop; + @A = split(//); + if (($A[0] ne '#') && ($_ ne "")) { + if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { + $bhead[$brule_num] = $1; + $bend[$brule_num] = $2; + $bpre[$brule_num] = $3; + if ($4 =~ /#/) { + $bnex[$brule_num] = ""; + $bsome[$brule_num] = $4; + } else { + $bnex[$brule_num] = $4; + $bsome[$brule_num] = $5; + } + if ($previous ne substr($bhead[$brule_num],0,1)) { + $bfirst{substr($bhead[$brule_num],0,1)} = $brule_num; + $blast{$previous} = $brule_num - 1; + } + $previous = 
substr($bhead[$brule_num++],0,1); + } else { + print "Rule format error in file basic_rules: Cannot parse $_\n"; + exit(1); + } + } +} +$blast{$previous} = $brule_num - 1; +close(BF); + +if ($brule_num == 0) { + print "No basic rules, Program exit!\n"; + exit(1); +} + +while(){ + next if ((/^#/) || (/^\s*$/) ); + chop; + if ($print_input) { + print $_, "\t"; + } + if ($pron{$_}) { + # print answer from preferences and skip to next word + print "$pron{$_}\t$stre{$_}\n"; + next; + } + $original = $_; + tr/A-ZÁÉÍÓÚÏÜÑ/a-záéíóúïüñ/; + $orig = "#" . $_ . "#"; + + @l = (); + + push(@l,split("",$orig)); + + @pron = &transfer(1); + + foreach (@pron) { + $a = $_; + y/aeiouáéíóú//cd; + if ($_ eq "") { + print "#No stressable vowel in $original\n"; + } else { + s/[aeiou]/0/go; + s/[áéíóú]/1/go; + if (!/1/) { + if(length() == 1){ + s/\b./1/o; + } elsif($l[$#l - 1] =~ /[aeiouns]/o){ + s/00\b/10/o; + } else { + s/0\b/1/o; + } + } + + $a =~ s/á/a/g; + $a =~ s/é/e/g; + $a =~ s/í/i/g; + $a =~ s/ó/o/g; + $a =~ s/ú/u/g; + + print "$a\t$_\n"; + } + } +} + +sub transfer{ + local($_) = @_; + local(@p) = (); + local($s) = 0; + local($over) = 0; + local($i,$j,$k) = (0,0,0); + + if ($_ >= length($orig) - 1) { + push(@p, ""); + return(@p); + } else { + + if ($vfile ne "") { + for ($i= $first{substr($orig, $_, 1)}; + $i <= $last{substr($orig, $_, 1)} ; $i++) { + if (&matchv($_,$i)) { + $s = $_ + length($head[$i]); + foreach $w (&transfer($s)) { + push(@p, $end[$i] . $w); + if ($some[$i] ne "") { + $over = 0; + } else { + $over = 1; + } + } + } + } + } + + if ($over == 0 ) { + $i = $bfirst{substr($orig, $_, 1)}; + while (($i <= $blast{substr($orig, $_, 1)}) && ($over == 0)) { + if (&matchb($_,$i)) { + $over = 1; + $s = $_ + length($bhead[$i]); + foreach $w (&transfer($s)) { + push(@p, $bend[$i] . $w); + } + } + $i++; + } + if ($over == 0) { + $s = $_ + 1; + foreach $w (&transfer($s)) { + push(@p, substr($orig,$_,1) . $w); + } + } + } + + return(@p); + } +} + +sub matchv { + $h = $head[$_[1]]; + $p = $pre[$_[1]]; + $n = $nex[$_[1]]; + + return(&match($_[0],$h,$p,$n)); + +} + +sub matchb { + $h = $bhead[$_[1]]; + $p = $bpre[$_[1]]; + $n = $bnex[$_[1]]; + + return(&match($_[0],$h,$p,$n)); + +} + +sub match { + + if (substr($orig, $_[0], length($_[1])) eq $_[1]) { + return ( &match_n($_[0] + length($_[1]) - 1, $_[3]) && + &match_p($_[0], $_[2])); + } else { + return (0); + } +} + +sub match_p { + local($a) = $_[0]; + local($b) = $_[1]; + local($_); + + if ($b eq "" ) { + return (1); + } else { + $_ = substr($orig, 0, $a) . "!"; + if (/($b)!/) { + return(1); + } else { + return(0); + } + } +} + +sub match_n { + local($a) = $_[0]; + local($b) = $_[1]; + local($_); + + if ($b eq "" ) { + return (1); + } else { + $_ = "!" . substr($orig, $a + 1, length($orig) - $a - 1); + if (/!($b)/) { + return(1); + } else { + return(0); + } + } +} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh new file mode 100755 index 00000000000..9f5855d56c4 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# The input is a subset of the dataset in use. (*.sph files) +# In addition the transcripts are needed as well. +# This script is only called internally and should not be +# used for any other purpose. A similar script for general usage +# is local/fsp_data_prep.sh +# To be run from one directory above this script. 
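+#
+# A usage sketch with illustrative argument values (the script only checks
+# that at least four arguments are supplied; see ../run.sh for how it is
+# actually invoked):
+#   local/subset_data_prep.sh /path/to/LDC2010S01 /path/to/LDC2010T04 \
+#     dev local/splits/split_fisher/dev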
+ +stage=0 + +export LC_ALL=C + + +if [ $# -lt 4 ]; then + echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories and the name of this partition +, and a list of files that belong to this partition . see ../run.sh for example." + exit 1; +fi + +subset=$3 +dir=`pwd`/data/local/$subset/data +mkdir -p $dir +local=`pwd`/local +utils=`pwd`/utils +tmpdir=`pwd`/data/local/tmp +mkdir -p $tmpdir + +. ./path.sh || exit 1; # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi +cd $dir + +# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command +# line arguments being absolute pathnames. +rm -r links/ 2>/dev/null +mkdir links/ +mkdir links/speech +mkdir links/transcripts +if [ ! -f $4 ]; then + echo "Please specify a valid parition file. Could not find $4" + exit 1; +fi +cat $4 | sed 's:.*/::g' | \ +xargs -I % find $1/ -name %* | xargs -I % echo cp % links/ + +# Basic spot checks to see if we got the data that we needed +if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; +then + echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti +vely" + exit 1; +fi + +if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; +then + echo "Disc 1 and 2 directories missing or not properly organised within the speech data dir" + echo "Typical format is LDC2010S01/DISC?/data/speech" + exit 1; +fi + +#Check the transcripts directories as well to see if they exist +if [ ! -d links/LDC2010T04/data/transcripts ]; +then + echo "Transcript directories missing or not properly organised" + echo "Typical format is LDC2010T04/data/transcripts" + exit 1; +fi + +speech_d1=$dir/links/LDC2010S01/DISC1/data/speech +speech_d2=$dir/links/LDC2010S01/DISC2/data/speech +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts +#Now check if we got all the files that we needed +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi + +if [ $stage -le 0 ]; then + #Gather all the speech files together to create a file list + #TODO: Train and test split might be required + ( + find $speech_d1 -iname '*.sph'; + find $speech_d2 -iname '*.sph'; + ) > $tmpdir/train_sph.flist + + #Get all the transcripts in one place + find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist +fi + +if [ $stage -le 1 ]; then + $local/fsp_make_trans.pl $tmpdir + mkdir -p $dir/train_all + mv $tmpdir/reco2file_and_channel $dir/train_all/ +fi + +if [ $stage -le 2 ]; then + sort $tmpdir/text.1 | grep -v '((' | \ + awk '{if (NF > 1){ print; }}' | \ + sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ + sed 's:\([^<]*\)<\/lname>:\1:g' | \ + sed 's:::g' | \ + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed 's:<\s*cough[\/]*>:[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's::[noise]:g' | \ + sed 's:[^<]*<\/background>:[noise]:g' | \ + sed -r 's:<[/]?background[/]?>:[noise]:g' | \ + #One 
more time to take care of nested stuff + sed 's:[^<]*<\/laugh>:[laughter]:g' | \ + sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ + #now handle the exceptions, find a cleaner way to do this? + sed 's:::g' | \ + sed 's:::g' | \ + sed 's:foreign>::g' | \ + sed 's:>::g' | \ + #How do you handle numbers? + grep -v '()' | \ + #Now go after the non-printable characters + sed -r 's:¿::g' > $tmpdir/text.2 + cp $tmpdir/text.2 $dir/train_all/text + + #Create segments file and utt2spk file + ! cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ + && echo "Error producing utt2spk file" && exit 1; + + cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; + $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/train_all/segments + + $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt +fi + +if [ $stage -le 3 ]; then + cat $tmpdir/train_sph.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp + cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ + sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; +fi + +if [ $stage -le 4 ]; then + # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. + cat $tmpdir/spk2gendertmp | sort | uniq > $dir/train_all/spk2gender +fi + +echo "Fisher Spanish Data preparation succeeded." + +exit 1; + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py new file mode 100755 index 00000000000..ce83fa8c8aa --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +import os +import sys + +files = [ +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/scoring/13.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/scoring/13.tra')] + +def findTranscription(timeDetail): + + for file1 in files: + file1.seek(0,0) + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + + +wordsFile = open('exp/tri5a/graph/words.txt') +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? +# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists('exp/tri5a/one-best/train'): + os.makedirs('exp/tri5a/one-best/train') + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + newFile.close() +provFile.close() + + + + + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py new file mode 100755 index 00000000000..b9f906b27da --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. 
Apache 2.0 + +from __future__ import print_function +import os +import sys +import subprocess + +latticeLocation = {1:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/latjosh-2/lattices-pushed/", +2:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/latjosh-2/lattices-pushed/", +3:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/latjosh-2/lattices-pushed/", +4:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/latjosh-2/lattices-pushed/", +5:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/latjosh-2/lattices-pushed/", +6:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/latjosh-2/lattices-pushed/", +7:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/latjosh-2/lattices-pushed/", +8:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/latjosh-2/lattices-pushed/", +9:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/latjosh-2/lattices-pushed/", +10:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/latjosh-2/lattices-pushed/"} + +latticeDict = {} + +for key,location in latticeLocation.items(): + for root, dirs, filenames in os.walk(location): + for f in filenames: + latticeDict[f] = str(key) + +tmpdir = 'data/local/data/tmp/lattmp' +if not os.path.exists(tmpdir): + os.makedirs(tmpdir) +invalidplfdir = 'data/local/data/tmp/invalidplf' +if not os.path.exists(invalidplfdir): + os.makedirs(invalidplfdir) +else: + os.system("rm " + invalidplfdir + "/*") + +def latticeConcatenate(lat1, lat2): + ''' + Concatenates lattices, writes temporary results to tmpdir + ''' + if lat1 == "": + if os.path.exists('rm ' + tmpdir + '/tmp.lat'): + os.system('rm ' + tmpdir + '/tmp.lat') + return lat2 + else: + proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) + proc.wait() + return tmpdir + '/tmp.lat' + + +def findLattice(timeDetail): + ''' + Finds the lattice corresponding to a time segment + ''' + searchKey = timeDetail + '.lat' + if searchKey in latticeDict: + return "/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-" + latticeDict[searchKey] + "/latjosh-2/lattices-pushed/" + searchKey + else: + return -1 + + +# Now read list of files in conversations +fileList = [] +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
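+# The loop below walks the conversations of this split: for each line of a
+# conversation's timing file it concatenates the per-utterance lattices with
+# fstconcat, removes epsilons and topologically sorts the result, converts it
+# to a PLF via fsm2plf.sh, and records blank or malformed PLFs so that the
+# corresponding lines can be removed later. The absolute /export/... paths
+# are specific to the original author's setup and need to be adapted.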
+# Now get timing information to concatenate the ASR outputs + +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.plf', 'w+') +lineNo = 1 +invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/invalidPLF', 'w+') +blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/blankPLF', 'w+') +rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/removeLines', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + for line in timingFile: + timeInfo = line.split() + + # For utterances that are concatenated in the translation file, + # the corresponding FSTs have to be translated as well + mergedTranslation = "" + for timeDetail in timeInfo: + tmp = findLattice(timeDetail) + if tmp != -1: + # Concatenate lattices + mergedTranslation = latticeConcatenate(mergedTranslation, tmp) + + if mergedTranslation != "": + + # Sanjeev's Recipe : Remove epsilons and topo sort + finalFST = tmpdir + "/final.fst" + os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) + + # Now convert to PLF + proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh /export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt ' + finalFST, stdout=subprocess.PIPE, shell=True) + PLFline = proc.stdout.readline() + finalPLFFile = tmpdir + "/final.plf" + finalPLF = open(finalPLFFile, "w+") + finalPLF.write(PLFline) + finalPLF.close() + + # now check if this is a valid PLF, if not write it's ID in a + # file so it can be checked later + proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) + line = proc.stdout.readline() + print("{} {}".format(line, lineNo)) + if line.strip() != "PLF format appears to be correct.": + os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) + invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + else: + provFile.write(PLFline) + else: + blankPLF.write(timeInfo[0] + "\n") + rmLines.write("{}\n".format(lineNo)) + # Now convert to PLF + lineNo += 1 + +provFile.close() +invalidPLF.close() +blankPLF.close() +rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh new file mode 100755 index 00000000000..29fbeebace6 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +stage=-2 +num_words_pocolm=110000 +prune_size=1000000 + +. ./path_venv.sh +. ./cmd.sh +. 
./utils/parse_options.sh + +set -euo pipefail + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + +textdir=$1 +pocolm_dir=$2 + + +if [ $stage -le -2 ];then + if [ -e "$textdir"/unigram_weights ]; then + rm "$textdir"/unigram_weights + fi + + if [ -e "$pocolm_dir" ]; then + rm -r "$pocolm_dir" + fi + + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" +fi + +if [ $stage -le -1 ];then + prune_lm_dir.py --target-num-ngrams=${prune_size} --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned + format_arpa_lm.py --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned | gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3.pocolm_pruned_${prune_size}.arpa.gz +fi + + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py new file mode 100755 index 00000000000..3f6444da294 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Copyright 2014 Gaurav Kumar. Apache 2.0 + +import os +import sys + +files = [ +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/oracle/oracle.tra'), +open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/oracle/oracle.tra')] + +def findTranscription(timeDetail): + + for file1 in files: + file1.seek(0,0) + for line in file1: + lineComp = line.split() + if lineComp[0] == timeDetail: + return " ".join(lineComp[1:]) + # No result found + return -1 + + +wordsFile = open('exp/tri5a/graph/words.txt') +words = {} + +# Extract word list +for line in wordsFile: + lineComp = line.split() + words[int(lineComp[1])] = lineComp[0].strip() + +# Now read list of files in conversations +fileList = [] +#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') +conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') +for line in conversationList: + line = line.strip() + line = line[:-4] + fileList.append(line) + +# IN what order were the conversations added to the spanish files? 
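+# (This script mirrors train_get_1_best.py, except that it reads the lattice
+# oracle transcripts, oracle/oracle.tra, rather than the one-best hypotheses
+# in scoring/13.tra, and writes asr.train.oracle.)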
+# TODO: Make sure they match the order in which these english files are being written + +# Now get timing information to concatenate the ASR outputs +if not os.path.exists('exp/tri5a/one-best/train'): + os.makedirs('exp/tri5a/one-best/train') + +#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') +provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.oracle', 'w+') +for item in fileList: + timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') + newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') + for line in timingFile: + timeInfo = line.split() + mergedTranslation = "" + for timeDetail in timeInfo: + #Locate this in ASR dev/test, this is going to be very slow + tmp = findTranscription(timeDetail) + if tmp != -1: + mergedTranslation = mergedTranslation + " " + tmp + mergedTranslation = mergedTranslation.strip() + transWords = [words[int(x)] for x in mergedTranslation.split()] + newFile.write(" ".join(transWords) + "\n") + provFile.write(" ".join(transWords) + "\n") + newFile.close() +provFile.close() + + + + + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter b/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter new file mode 100755 index 00000000000..4fce42945b3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter @@ -0,0 +1,5 @@ +#!/bin/sed -f +s:\[laughter\]::g +s:\[noise\]::g +s:\[oov\]::g +s:::g diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh new file mode 100755 index 00000000000..2fc3de37406 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -0,0 +1,13 @@ +export KALDI_ROOT=`pwd`/../../../../kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LD_LIBRARY_PATH=/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh b/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh new file mode 100755 index 00000000000..80edbbaf69a --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh @@ -0,0 +1,13 @@ +export KALDI_ROOT=`pwd`/../../../../kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LD_LIBRARY_PATH=/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/rnnlm b/egs/fisher_callhome_spanish/s5_gigaword/rnnlm new file mode 120000 index 00000000000..fb754622d5e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/rnnlm @@ -0,0 +1 @@ +../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh new file mode 100755 index 00000000000..5f7068072f3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -0,0 +1,299 @@ +#!/bin/bash +# +# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# Recipe for Fisher/Callhome-Spanish + +stage=-1 +lmstage=-2 +train_sgmm2=false + +# call the next line with the directory where the Spanish Fisher data is +# (the values below are just an example). +sfisher_speech=/export/corpora/LDC/LDC2010S01 +sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +spanish_lexicon=/export/corpora/LDC/LDC96L16 +split=local/splits/split_fisher + +callhome_speech=/export/corpora/LDC/LDC96S35 +callhome_transcripts=/export/corpora/LDC/LDC96T17 +split_callhome=local/splits/split_callhome + +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data # Path to the download of Gigaword data +rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm # Work path for entire Gigaword LM and text processing, should be + # large free spae and easy IO access. +mfccdir=`pwd`/mfcc + +. ./cmd.sh +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +set -eou pipefail + +if [ $stage -le -1 ]; then + local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + local/callhome_data_prep.sh $callhome_speech $callhome_transcripts + + # The lexicon is created using the LDC spanish lexicon, the words from the + # fisher spanish corpus. Additional (most frequent) words are added from the + # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted + # wordlist is downloaded if it is not available. + local/fsp_prepare_dict.sh $spanish_lexicon + + # Added c,j, v to the non silences phones manually + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. Depending on your needs, you might have to change the size of + # the splits within that file. 
The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + + utils/fix_data_dir.sh data/local/data/train_all + + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1; + + utils/fix_data_dir.sh data/local/data/train_all + utils/validate_data_dir.sh data/local/data/train_all + + cp -r data/local/data/train_all data/train_all + + # For the CALLHOME corpus + utils/fix_data_dir.sh data/local/data/callhome_train_all + + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1; + + utils/fix_data_dir.sh data/local/data/callhome_train_all + utils/validate_data_dir.sh data/local/data/callhome_train_all + + cp -r data/local/data/callhome_train_all data/callhome_train_all + + # Creating data partitions for the pipeline + # We need datasets for both the ASR and SMT system + # We have 257455 utterances left, so the partitions are roughly as follows + # ASR Train : 100k utterances + # ASR Tune : 17455 utterances + # ASR Eval : 20k utterances + # MT Train : 100k utterances + # MT Tune : Same as the ASR eval set (Use the lattices from here) + # MT Eval : 20k utterances + # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker + # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. + # As noted above, the LM has not been trained on the dev and the test sets. + #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test + #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test + #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test + #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev + #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test + #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train + #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test + #rm -r data/dev_and_test + #rm -r data/asr_dev_and_test + #rm -r data/mt_train_and_test + + local/create_splits.sh $split + local/callhome_create_splits.sh $split_callhome +fi + +if [ $stage -le 0 ]; then + mkdir -p "$rnnlm_workdir"/gigaword_rawtext + local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 + cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt + local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ + "$rnnlm_workdir"/normalised_gigaword_corpus/ + mkdir -p "$rnnlm_workdir"/text_lm + cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt + cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. 
+ cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt +fi + + +if [ $stage -le 1 ]; then + num_words_pocolm=110000 + local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + cat "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt > "$rnnlm_workdir"/rnnlm_wordlist.txt + cut -f 1 -d " " data/lang/words.txt >> "$rnnlm_workdir"/rnnlm_wordlist.txt + cat "$rnnlm_workdir"/rnnlm_wordlist.txt | sort | uniq > "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/100000_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text "$rnnlm_workdir"/text_lm --text-dir "$rnnlm_workdir"/text_lm +fi + +if [ $stage -le 2 ]; then + # Now compute CMVN stats for the train, dev and test subsets + steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir + steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir + steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir + #steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir + #steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir + + #n=$[`cat data/train_all/segments | wc -l` - 158126] + #utils/subset_data_dir.sh --last data/train_all $n data/train + steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir + + steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir + steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir + steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir + + # Again from Dan's recipe : Reduced monophone training data + # Now-- there are 1.6 million utterances, and we want to start the monophone training + # on relatively short utterances (easier to align), but not only the very shortest + # ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random + # utterances from those. 
+ + utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort + utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k + utils/data/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup + utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k + utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +fi + +if [ $stage -le 3 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_10k_nodup data/lang exp/mono0a + + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; + + + (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri1/graph data/dev exp/tri1/decode_dev)& + + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; + + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + )& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; + +# Train tri3a, which is LDA+MLLT, on 100k data. + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + )& +fi + +if [ $stage -le 5 ]; then +# Next we'll use fMLLR and train with SAT (i.e. 
on +# fMLLR features) + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev +)& + + + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; + +# Reduce the number of gaussians + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test + + # Decode CALLHOME + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train + ) & + + + steps/align_fmllr.sh \ + --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ + data/train data/lang exp/tri5a exp/tri5a_ali +fi + +if $train_sgmm2; then + +steps/train_ubm.sh \ + --cmd "$train_cmd" 750 \ + data/train data/lang exp/tri5a_ali exp/ubm5 + +steps/train_sgmm2.sh \ + --cmd "$train_cmd" 5000 18000 \ + data/train data/lang exp/tri5a_ali exp/ubm5/final.ubm exp/sgmm5 + +utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph + +( + steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ + --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ + exp/sgmm5/graph data/dev exp/sgmm5/decode_dev +)& + +steps/align_sgmm2.sh \ + --nj 32 --cmd "$train_cmd" --transform-dir exp/tri5a_ali \ + --use-graphs true --use-gselect true \ + data/train data/lang exp/sgmm5 exp/sgmm5_ali + +steps/make_denlats_sgmm2.sh \ + --nj 32 --sub-split 32 --num-threads 4 \ + --beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri5a_ali \ + data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats + +steps/train_mmi_sgmm2.sh \ + --cmd "$train_cmd" --drop-frames true --transform-dir exp/tri5a_ali --boost 0.1 \ + data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \ + exp/sgmm5_mmi_b0.1 + +( +utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph +steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 12"\ + exp/tri5a/graph data/dev exp/tri5a/decode_dev +utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph +steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ + --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ + exp/sgmm5/graph data/dev exp/sgmm5/decode_dev +for iter in 1 2 3 4; do + decode=exp/sgmm5_mmi_b0.1/decode_dev_it$iter + mkdir -p $decode + steps/decode_sgmm2_rescore.sh \ + --cmd "$decode_cmd" --iter $iter --transform-dir 
exp/tri5a/decode_dev \ + data/lang_test data/dev/ exp/sgmm5/decode_dev $decode +done +) & +fi + +wait; + +if [ $stage -le 6 ]; then + local/chain/run_tdnn_1g.sh || exit 1; +fi +exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/steps b/egs/fisher_callhome_spanish/s5_gigaword/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/utils b/egs/fisher_callhome_spanish/s5_gigaword/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file From e8aecbb584d05eb0b4cad22d3d57a59b0a20a8d5 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Tue, 19 Feb 2019 10:47:15 +0530 Subject: [PATCH 02/49] Some bug fixes --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 5f7068072f3..89e8fbd434b 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -118,8 +118,8 @@ if [ $stage -le 1 ]; then cat "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt > "$rnnlm_workdir"/rnnlm_wordlist.txt cut -f 1 -d " " data/lang/words.txt >> "$rnnlm_workdir"/rnnlm_wordlist.txt cat "$rnnlm_workdir"/rnnlm_wordlist.txt | sort | uniq > "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/100000_3.pocolm \ - --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text "$rnnlm_workdir"/text_lm --text-dir "$rnnlm_workdir"/text_lm + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text-dir "$rnnlm_workdir"/text_lm fi if [ $stage -le 2 ]; then From ece34bd064bfbdcae7b655552057469c5d47b0b2 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Tue, 19 Feb 2019 10:48:44 +0530 Subject: [PATCH 03/49] Update rnnlm.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh index aa06fdbb293..3850910f312 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh @@ -21,7 +21,6 @@ lstm_rpd=256 lstm_nrpd=256 stage=0 train_stage=-30 -text=Spanish_gigawrd/text_lm text_dir=Spanish_gigawrd/text_lm . ./cmd.sh @@ -30,7 +29,7 @@ text_dir=Spanish_gigawrd/text_lm mkdir -p $dir/config set -e -for f in $text/dev.txt; do +for f in $text_dir/dev.txt; do [ ! 
-f $f ] && \ echo "$0: expected file $f to exist;" && exit 1 done From 0c4fe470684751a54e4def8600dde847b8507cd5 Mon Sep 17 00:00:00 2001 From: saikiran valluri Date: Tue, 19 Feb 2019 01:27:47 -0500 Subject: [PATCH 04/49] Combining lexicon words with pocolm wordslist for RNNLM training --- .../s5_gigaword/local/get_rnnlm_wordlist.py | 32 ++++++++++++++ .../s5_gigaword/local/rnnlm.sh | 3 +- .../s5_gigaword/run.sh | 42 ++++--------------- 3 files changed, 42 insertions(+), 35 deletions(-) create mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py new file mode 100755 index 00000000000..d6ddfbecc14 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 4: + print( "Usage: python get_rnnlm_wordlist.py ") + sys.exit() + +lexicon_words = open(sys.argv[1], 'r') +pocolm_words = open(sys.argv[2], 'r') +rnnlm_wordsout = open(sys.argv[3], 'w') + +line_count=0 +lexicon=[] + +for line in lexicon_words: + lexicon.append(line.split()[0]) + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +for line in pocolm_words: + if not line.split()[0] in lexicon: + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +lexicon_words.close() +pocolm_words.close() +rnnlm_wordsout.close() + diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh index aa06fdbb293..3850910f312 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh @@ -21,7 +21,6 @@ lstm_rpd=256 lstm_nrpd=256 stage=0 train_stage=-30 -text=Spanish_gigawrd/text_lm text_dir=Spanish_gigawrd/text_lm . ./cmd.sh @@ -30,7 +29,7 @@ text_dir=Spanish_gigawrd/text_lm mkdir -p $dir/config set -e -for f in $text/dev.txt; do +for f in $text_dir/dev.txt; do [ ! -f $f ] && \ echo "$0: expected file $f to exist;" && exit 1 done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 5f7068072f3..80c0debfb12 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -19,9 +19,8 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data # Path to the download of Gigaword data -rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm # Work path for entire Gigaword LM and text processing, should be - # large free spae and easy IO access. +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data +rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm mfccdir=`pwd`/mfcc . ./cmd.sh @@ -31,8 +30,9 @@ if [ -f path.sh ]; then . ./path.sh; fi set -eou pipefail if [ $stage -le -1 ]; then - local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts - local/callhome_data_prep.sh $callhome_speech $callhome_transcripts +# local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + +# local/callhome_data_prep.sh $callhome_speech $callhome_transcripts # The lexicon is created using the LDC spanish lexicon, the words from the # fisher spanish corpus. 
Additional (most frequent) words are added from the @@ -72,29 +72,6 @@ if [ $stage -le -1 ]; then cp -r data/local/data/callhome_train_all data/callhome_train_all - # Creating data partitions for the pipeline - # We need datasets for both the ASR and SMT system - # We have 257455 utterances left, so the partitions are roughly as follows - # ASR Train : 100k utterances - # ASR Tune : 17455 utterances - # ASR Eval : 20k utterances - # MT Train : 100k utterances - # MT Tune : Same as the ASR eval set (Use the lattices from here) - # MT Eval : 20k utterances - # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker - # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. - # As noted above, the LM has not been trained on the dev and the test sets. - #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test - #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test - #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test - #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev - #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test - #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train - #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test - #rm -r data/dev_and_test - #rm -r data/asr_dev_and_test - #rm -r data/mt_train_and_test - local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome fi @@ -115,11 +92,10 @@ fi if [ $stage -le 1 ]; then num_words_pocolm=110000 local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - cat "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt > "$rnnlm_workdir"/rnnlm_wordlist.txt - cut -f 1 -d " " data/lang/words.txt >> "$rnnlm_workdir"/rnnlm_wordlist.txt - cat "$rnnlm_workdir"/rnnlm_wordlist.txt | sort | uniq > "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/100000_3.pocolm \ - --wordslist "$rnnlm_workdir"/rnnlm_wordlist.txt.uniq --text "$rnnlm_workdir"/text_lm --text-dir "$rnnlm_workdir"/text_lm + local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm fi if [ $stage -le 2 ]; then From 1439b0dd9d0d2ae527e0ddd14c6a4b39c7bd7075 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sun, 24 Feb 2019 01:54:54 -0500 Subject: [PATCH 05/49] Integrated the 2 stage scientific method POCOLM training for Gigaword corpus --- .../s5_gigaword/cmd.sh | 2 +- .../local/get_unigram_weights_vocab.py | 33 +++++++++++++++++++ .../s5_gigaword/local/pocolm_cust.sh | 7 ++-- .../s5_gigaword/local/train_pocolm.sh | 26 +++++++++++---- .../s5_gigaword/run.sh | 5 ++- 5 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py diff --git a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh index 0511bd2bbb0..db97f1fbc6f 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh +++ 
b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="retry.pl queue.pl" +export train_cmd="retry.pl queue.pl --mem 8G" export decode_cmd="retry.pl queue.pl --mem 8G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py new file mode 100644 index 00000000000..43cf8392167 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 3: + print("Usage : python . ") + print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") + sys.exit() + +pocolmdir=sys.argv[1] +unigramwts=open(sys.argv[2], 'w') + +names = open(pocolmdir+"/names", 'r') +metaparams = open(pocolmdir+"/metaparameters", 'r') + +name_mapper={} +for line in names: + fields=line.split() + name_mapper[fields[0]] = fields[1] + +lns = metaparams.readlines() +for lineno in range(len(name_mapper.keys())): + line = lns[lineno] + fileid = line.split()[0].split("_")[-1] + weight = line.split()[1] + unigramwts.write(name_mapper[fileid] + " " + weight + "\n") + +names.close() +unigramwts.close() +metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh index a3b2d77d860..c6642f6fcf4 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh @@ -13,6 +13,8 @@ export PATH=$PATH:$POCOLM_ROOT/scripts wordlist=None num_word=100000 +pocolm_stage=2 +ngram_order=3 lm_dir= arpa_dir= textdir= @@ -55,7 +57,7 @@ limit_unk_history_opt= # un-comment the following line #limit_unk_history_opt="--limit-unk-history=true" -for order in 3; do +for order in ${ngram_order}; do # decide on the vocabulary. # Note: you'd use --wordlist if you had a previously determined word-list # that you wanted to use. @@ -72,6 +74,7 @@ for order in 3; do --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + if [ $pocolm_stage -eq 2 ];then mkdir -p ${arpa_dir} format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz @@ -93,7 +96,7 @@ for order in 3; do get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz - + fi done # (run local/srilm_baseline.sh ${num_word} to see the following result e.g. 
local/srilm_baseline.sh 40000 ) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index 29fbeebace6..8ceb08f281a 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -17,22 +17,34 @@ textdir=$1 pocolm_dir=$2 -if [ $stage -le -2 ];then +if [ $stage -le -2 ]; then + echo "\n\n" + echo " POCOLM experiment : Runnning STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." + echo "\n\n" if [ -e "$textdir"/unigram_weights ]; then rm "$textdir"/unigram_weights fi - if [ -e "$pocolm_dir" ]; then rm -r "$pocolm_dir" fi + + bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ - --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" fi - + if [ $stage -le -1 ];then - prune_lm_dir.py --target-num-ngrams=${prune_size} --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned - format_arpa_lm.py --max-memory=8G "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned | gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3.pocolm_pruned_${prune_size}.arpa.gz + echo "\n\n" + echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." + echo "\n\n" + + echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done + python local/get_unigramwts.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + + fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 80c0debfb12..6e2ee9d4f25 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -20,7 +20,7 @@ callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=/export/c03/svalluri/workdir_rnnlm +rnnlm_workdir=/export/c03/svalluri/workdir_pocolm_2stage mfccdir=`pwd`/mfcc . 
./cmd.sh @@ -94,6 +94,9 @@ if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ "$rnnlm_workdir"/rnnlm_wordlist +fi + +if [ $stage -le 2 ]; then local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm fi From 8ad0e0130c011fef22f583d1ca60e0c0d6f856a0 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 26 Feb 2019 05:17:14 +0000 Subject: [PATCH 06/49] Update train_pocolm.sh --- .../s5_gigaword/local/train_pocolm.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index 8ceb08f281a..c8adb79383e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -18,10 +18,10 @@ pocolm_dir=$2 if [ $stage -le -2 ]; then - echo "\n\n" - echo " POCOLM experiment : Runnning STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo "****" + echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." - echo "\n\n" + echo "****" if [ -e "$textdir"/unigram_weights ]; then rm "$textdir"/unigram_weights fi @@ -35,12 +35,12 @@ if [ $stage -le -2 ]; then fi if [ $stage -le -1 ];then - echo "\n\n" + echo "********" echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." 
- echo "\n\n" + echo "********" echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done - python local/get_unigramwts.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" From f856ac2c4cd0da3c7df4aab65a1eace387dd60b7 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Wed, 27 Feb 2019 15:36:32 +0530 Subject: [PATCH 07/49] Update run.sh --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 6e2ee9d4f25..bd553fc720e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -88,10 +88,9 @@ if [ $stage -le 0 ]; then cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi - +num_words_pocolm=110000 if [ $stage -le 1 ]; then - num_words_pocolm=110000 - local/train_pocolm.sh --stage $lmstage --num-words-pocolm 110000 "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ "$rnnlm_workdir"/rnnlm_wordlist fi From 684f029e77da3426c59e3b4106ce6b45160de088 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 28 Feb 2019 11:57:29 +0000 Subject: [PATCH 08/49] Text cleaning script for splitting Abbreviation words added --- .../s5_gigaword/local/clean_abbrevs_text.py | 33 +++++++++++++++++++ .../s5_gigaword/local/run_norm.sh | 3 ++ scripts/rnnlm/choose_features.py | 12 ++----- scripts/rnnlm/get_best_model.py | 28 ++++++++-------- scripts/rnnlm/get_embedding_dim.py | 4 +-- scripts/rnnlm/get_num_splits.sh | 2 +- scripts/rnnlm/get_special_symbol_opts.py | 8 ++--- scripts/rnnlm/get_unigram_probs.py | 18 ++++------ scripts/rnnlm/get_vocab.py | 11 +++---- scripts/rnnlm/get_word_features.py | 15 ++++----- scripts/rnnlm/lmrescore.sh | 6 ---- scripts/rnnlm/lmrescore_nbest.sh | 4 +-- scripts/rnnlm/lmrescore_pruned.sh | 17 +++------- scripts/rnnlm/prepare_rnnlm_dir.sh | 9 ++--- scripts/rnnlm/prepare_split_data.py | 13 +++----- scripts/rnnlm/rnnlm_cleanup.py | 2 +- scripts/rnnlm/show_word_features.py | 19 +++-------- scripts/rnnlm/train_rnnlm.sh | 2 +- scripts/rnnlm/validate_features.py | 7 ++-- scripts/rnnlm/validate_text_dir.py | 11 +++---- scripts/rnnlm/validate_word_features.py | 11 +++---- 21 files changed, 104 insertions(+), 131 deletions(-) create mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py new file mode 100644 index 00000000000..22fc54f18cc --- /dev/null +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., + +import os, sys +import re +import codecs + +if len(sys.argv) < 3: + print("Usage : python clean_abbrevs_text.py ") + print(" Processes the text before text normalisation to 
convert uppercase words as space separated letters") + sys.exit() + +inputfile=codecs.open(sys.argv[1], encoding='utf-8') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w+') + +for line in inputfile: + words = line.split() + textout = "" + wordcnt = 0 + for word in words: + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word) and wordcnt>0: + print(word) + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " + wordcnt = wordcnt + 1 + outputfile.write(textout.strip()+ '\n') + +inputfile.close() +outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh index 4a26f6857b8..f88fecc815c 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh @@ -24,7 +24,10 @@ for i in "${punctuation_symbols[@]}"; do num_syms=$((num_syms+1)) done mkdir -p $dir/normalize/$job +local/clean_abbrevs_text.py $data/$job $data/"$job"_processed +mv $data/"$job"_processed $data/$job echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh + bash $dir/normalize/$job/substitute.sh | \ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index c6621e04494..799f6b6dcc8 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -10,12 +10,6 @@ from collections import defaultdict sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) -# because this script splits inside words, we cannot use latin-1; we actually need to know what -# what the encoding is. By default we make this utf-8; to handle encodings that are not compatible -# with utf-8 (e.g. gbk), we'll eventually have to make the encoding an option to this script. - -import re -tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " @@ -90,9 +84,9 @@ # and 'wordlist' is a list indexed by integer id, that returns the string-valued word. def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8") as f: + with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -121,7 +115,7 @@ def read_unigram_probs(unigram_probs_file): unigram_probs = [] with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py index 333ed8dbfc7..45487b18b0c 100755 --- a/scripts/rnnlm/get_best_model.py +++ b/scripts/rnnlm/get_best_model.py @@ -3,14 +3,14 @@ # Copyright 2017 Johns Hopkins University (author: Daniel Povey) # License: Apache 2.0. 
+import os import argparse -import glob -import re import sys +import re parser = argparse.ArgumentParser(description="Works out the best iteration of RNNLM training " - "based on dev-set perplexity, and prints the number corresponding " - "to that iteration", + "based on dev-set perplexity, and prints the number corresponding " + "to that iteration", epilog="E.g. " + sys.argv[0] + " exp/rnnlm_a", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -19,9 +19,10 @@ args = parser.parse_args() -num_iters = None + +num_iters=None try: - with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f: + with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f: for line in f: a = line.split("=") if a[0] == "num_iters": @@ -35,15 +36,15 @@ sys.exit(sys.argv[0] + ": could not get num_iters from {0}/info.txt".format( args.rnnlm_dir)) -best_objf = -2000 -best_iter = -1 -for i in range(1, num_iters): +best_objf=-2000 +best_iter=-1 +for i in range(num_iters): this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i) try: - f = open(this_logfile, 'r', encoding='latin-1') + f = open(this_logfile, 'r', encoding='utf-8') except: sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile)) - this_objf = -1000 + this_objf=-1000 for line in f: m = re.search('Overall objf .* (\S+)$', str(line)) if m is not None: @@ -52,10 +53,6 @@ except Exception as e: sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format( this_logfile, line, str(e))) - # verify this iteration still has model files present - if len(glob.glob("{0}/{1}.raw".format(args.rnnlm_dir, i))) == 0: - # this iteration has log files, but model files have been cleaned up, skip it - continue if this_objf == -1000: print(sys.argv[0] + ": warning: could not parse objective function from {0}".format( this_logfile), file=sys.stderr) @@ -66,4 +63,5 @@ if best_iter == -1: sys.exit(sys.argv[0] + ": error: could not get best iteration.") + print(str(best_iter)) diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py index 63eaf307498..b6810ef2cbf 100755 --- a/scripts/rnnlm/get_embedding_dim.py +++ b/scripts/rnnlm/get_embedding_dim.py @@ -45,7 +45,7 @@ left_context=0 right_context=0 for line in out_lines: - line = line.decode('latin-1') + line = line.decode('utf-8') m = re.search(r'input-node name=input dim=(\d+)', line) if m is not None: try: @@ -101,4 +101,4 @@ "nnet '{0}': {1} != {2}".format( args.nnet, input_dim, output_dim)) -print('{}'.format(input_dim)) +print(str(input_dim)) diff --git a/scripts/rnnlm/get_num_splits.sh b/scripts/rnnlm/get_num_splits.sh index 974fd8bf204..93d1f7f169c 100755 --- a/scripts/rnnlm/get_num_splits.sh +++ b/scripts/rnnlm/get_num_splits.sh @@ -65,7 +65,7 @@ tot_with_multiplicities=0 for f in $text/*.counts; do if [ "$f" != "$text/dev.counts" ]; then - this_tot=$(cat $f | awk '{tot += $2} END{printf("%d", tot)}') + this_tot=$(cat $f | awk '{tot += $2} END{print tot}') if ! 
[ $this_tot -gt 0 ]; then echo "$0: there were no counts in counts file $f" 1>&2 exit 1 diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 4310b116ad7..13fe497faf9 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -8,9 +8,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " "print out the options with correct value to stdout, which may look like " @@ -28,10 +25,9 @@ lower_ids = {} upper_ids = {} -input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') +input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace') for line in input_stream: - fields = re.split(tab_or_space, line) - assert(len(fields) == 2) + fields = line.split() sym = fields[0] if sym in special_symbols: assert sym not in lower_ids diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index ab3f9bb382f..32b01728ca3 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. " + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " "--data-weights-file=exp/rnnlm/data_weights.txt data/rnnlm/data " @@ -77,10 +74,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="latin-1") as f: + with open(weights_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: try: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -102,9 +99,9 @@ def read_data_weights(weights_file, data_sources): # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="latin-1") as f: + with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -131,11 +128,10 @@ def get_counts(data_sources, data_weights, vocab): if weight == 0.0: continue - with open(counts_file, 'r', encoding="latin-1") as f: + with open(counts_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) - if len(fields) != 2: print("Warning, should be 2 cols:", fields, line, file=sys.stderr); - assert(len(fields) == 2) + fields = line.split() + assert len(fields) == 2 word = fields[0] count = fields[1] if word not in vocab: diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index 1502e915f9c..f290ef721c1 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -6,10 +6,7 @@ import os import argparse import sys -sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) - -import re -tab_or_space = re.compile('[ \t]+') +sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", @@ -28,10 +25,10 @@ # Add the count for every word in counts_file # the result is written into word_counts def add_counts(word_counts, counts_file): - with open(counts_file, 'r', encoding="latin-1") as f: + with open(counts_file, 'r', encoding="utf-8") as f: for line in f: - line = line.strip(" \t\r\n") - word_and_count = re.split(tab_or_space, line) + line = line.strip() + word_and_count = line.split() assert len(word_and_count) == 2 if word_and_count[0] in word_counts: word_counts[word_and_count[0]] += int(word_and_count[1]) diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index aeb7a3ec6ae..8bdb553b9c8 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -9,9 +9,6 @@ import math from collections import defaultdict -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " --unigram-probs=exp/rnnlm/unigram_probs.txt " @@ -41,9 +38,9 @@ # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="latin-1") as f: + with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -62,9 +59,9 @@ def read_vocab(vocab_file): # return a list of unigram_probs, indexed by word id def read_unigram_probs(unigram_probs_file): unigram_probs = [] - with open(unigram_probs_file, 'r', encoding="latin-1") as f: + with open(unigram_probs_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): @@ -103,9 +100,9 @@ def read_features(features_file): feats['min_ngram_order'] = 10000 feats['max_ngram_order'] = -1 - with open(features_file, 'r', encoding="latin-1") as f: + with open(features_file, 'r', encoding="utf-8", errors='replace') as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) diff --git a/scripts/rnnlm/lmrescore.sh b/scripts/rnnlm/lmrescore.sh index 9da22ae75a2..cd0cf793d8d 100755 --- a/scripts/rnnlm/lmrescore.sh +++ b/scripts/rnnlm/lmrescore.sh @@ -72,12 +72,6 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; -if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then - # the last word of the RNNLM word list is an added word - echo "$0: Word lists mismatch for lattices and RNNLM." - exit 1 -fi - oldlm_command="fstproject --project_output=true $oldlm |" special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt) diff --git a/scripts/rnnlm/lmrescore_nbest.sh b/scripts/rnnlm/lmrescore_nbest.sh index 58b19b9fa79..f50a3c909f0 100755 --- a/scripts/rnnlm/lmrescore_nbest.sh +++ b/scripts/rnnlm/lmrescore_nbest.sh @@ -29,7 +29,7 @@ if [ $# != 6 ]; then echo "This version applies an RNNLM and mixes it with the LM scores" echo "previously in the lattices., controlled by the first parameter (rnnlm-weight)" echo "" - echo "Usage: $0 [options] " + echo "Usage: utils/rnnlmrescore.sh " echo "Main options:" echo " --inv-acwt # default 12. e.g. --inv-acwt 17. Equivalent to LM scale to use." echo " # for N-best list generation... note, we'll score at different acwt's" @@ -177,7 +177,7 @@ fi if [ $stage -le 6 ]; then echo "$0: invoking rnnlm/compute_sentence_scores.sh which calls rnnlm to get RNN LM scores." $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ - rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ + local/rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ $adir.JOB/words_text $adir.JOB/lmwt.rnn fi if [ $stage -le 7 ]; then diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 9ba78415708..46ee5846424 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -16,18 +16,16 @@ max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram- # the same ngram history and this prevents the lattice from # exploding exponentially. Details of the n-gram approximation # method are described in section 2.3 of the paper - # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf -max_arcs= # limit the max arcs in lattice while rescoring. 
E.g., 20000 + # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdm +max_arcs=499 # limit the max arcs in lattice while rescoring. E.g., 20000 -acwt=0.1 -weight=0.5 # Interpolation weight for RNNLM. +acwt=1 +weight=1 # Interpolation weight for RNNLM. normalize=false # If true, we add a normalization step to the output of the RNNLM # so that it adds up to *exactly* 1. Note that this is not necessary # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf -lattice_prune_beam=4 # Beam used in pruned lattice composition - # This option affects speed and how large the composed lattice may be # End configuration section. @@ -75,12 +73,6 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; -if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then - # the last word of the RNNLM word list is an added word - echo "$0: Word lists mismatch for lattices and RNNLM." - exit 1 -fi - normalize_opt= if $normalize; then normalize_opt="--normalize-probs=true" @@ -105,7 +97,6 @@ cp $indir/num_jobs $outdir $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore-kaldi-rnnlm-pruned --lm-scale=$weight $special_symbol_opts \ - --lattice-compose-beam=$lattice_prune_beam \ --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order $normalize_opt $max_arcs_opt \ $carpa_option $oldlm $word_embedding "$rnnlm_dir/final.raw" \ "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; diff --git a/scripts/rnnlm/prepare_rnnlm_dir.sh b/scripts/rnnlm/prepare_rnnlm_dir.sh index e101822d983..d3ee44f1f95 100755 --- a/scripts/rnnlm/prepare_rnnlm_dir.sh +++ b/scripts/rnnlm/prepare_rnnlm_dir.sh @@ -23,7 +23,7 @@ if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "Sets up the directory for RNNLM training as done by" echo "rnnlm/train_rnnlm.sh, and initializes the model." - echo " is as validated by rnnlm/validate_text_dir.py" + echo " is as validated by rnnlm/validate_data_dir.py" echo " is as validated by rnnlm/validate_config_dir.sh." exit 1 fi @@ -34,7 +34,6 @@ config_dir=$2 dir=$3 set -e -. ./path.sh if [ $stage -le 0 ]; then echo "$0: validating input" @@ -53,13 +52,9 @@ if [ $stage -le 1 ]; then echo "$0: copying config directory" mkdir -p $dir/config # copy expected things from $config_dir to $dir/config. - for f in words.txt data_weights.txt oov.txt xconfig; do + for f in words.txt features.txt data_weights.txt oov.txt xconfig; do cp $config_dir/$f $dir/config done - # features.txt is optional, check separately - if [ -f $config_dir/features.txt ]; then - cp $config_dir/features.txt $dir/config - fi fi rnnlm/get_special_symbol_opts.py < $dir/config/words.txt > $dir/special_symbol_opts.txt diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index cceac48313e..9cc4f69d09f 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -8,9 +8,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " @@ -66,10 +63,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="latin-1") as f: + with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -97,7 +94,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): num_outputs = len(output_filehandles) n = 0 try: - f = open(source_filename, 'r', encoding="latin-1") + f = open(source_filename, 'r', encoding="utf-8") except Exception as e: sys.exit(sys.argv[0] + ": failed to open file {0} for reading: {1} ".format( source_filename, str(e))) @@ -124,7 +121,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): os.makedirs(args.split_dir + "/info") # set up the 'num_splits' file, which contains an integer. -with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="latin-1") as f: +with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="utf-8") as f: print(args.num_splits, file=f) # e.g. set temp_files = [ 'foo/1.tmp', 'foo/2.tmp', ..., 'foo/5.tmp' ] @@ -136,7 +133,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): temp_filehandles = [] for fname in temp_files: try: - temp_filehandles.append(open(fname, 'w', encoding="latin-1")) + temp_filehandles.append(open(fname, 'w', encoding="utf-8")) except Exception as e: sys.exit(sys.argv[0] + ": failed to open file: " + str(e) + ".. if this is a max-open-filehandles limitation, you may " diff --git a/scripts/rnnlm/rnnlm_cleanup.py b/scripts/rnnlm/rnnlm_cleanup.py index 40cbee7a496..6a304f7f4cb 100644 --- a/scripts/rnnlm/rnnlm_cleanup.py +++ b/scripts/rnnlm/rnnlm_cleanup.py @@ -69,7 +69,7 @@ def get_compute_prob_info(log_file): compute_prob_done = False # roughly based on code in get_best_model.py try: - f = open(log_file, "r", encoding="latin-1") + f = open(log_file, "r", encoding="utf-8") except: print(script_name + ": warning: compute_prob log not found for iteration " + str(iter) + ". Skipping", diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 89b134adaf9..89d84d53f3e 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -6,16 +6,7 @@ import os import argparse import sys - -# The use of latin-1 encoding does not preclude reading utf-8. latin-1 encoding -# means "treat words as sequences of bytes", and it is compatible with utf-8 -# encoding as well as other encodings such as gbk, as long as the spaces are -# also spaces in ascii (which we check). It is basically how we emulate the -# behavior of python before python3. -sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) - -import re -tab_or_space = re.compile('[ \t]+') +sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. 
" + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " @@ -36,9 +27,9 @@ def read_feature_type_and_key(features_file): feat_types = {} - with open(features_file, 'r', encoding="latin-1") as f: + with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [2, 3, 4]) feat_id = int(fields[0]) @@ -53,9 +44,9 @@ def read_feature_type_and_key(features_file): feat_type_and_key = read_feature_type_and_key(args.features_file) num_word_feats = 0 -with open(args.word_features_file, 'r', encoding="latin-1") as f: +with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) % 2 == 1 print(int(fields[0]), end='\t') diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh index 013e9a56c2f..f056d096120 100755 --- a/scripts/rnnlm/train_rnnlm.sh +++ b/scripts/rnnlm/train_rnnlm.sh @@ -41,7 +41,7 @@ use_gpu_for_diagnostics=false # set true to use GPU for compute_prob_*.log # optional cleanup options cleanup=false # add option --cleanup true to enable automatic cleanup of old models cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best" -cleanup_keep_iters=3 # number of iterations that will have their models retained +cleanup_keep_iters=100 # number of iterations that will have their models retained trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM . utils/parse_options.sh diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index 2a077da4758..a650092b086 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " exp/rnnlm/features.txt", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -24,7 +21,7 @@ if not os.path.isfile(args.features_file): sys.exit(sys.argv[0] + ": Expected file {0} to exist".format(args.features_file)) -with open(args.features_file, 'r', encoding="latin-1") as f: +with open(args.features_file, 'r', encoding="utf-8") as f: has_unigram = False has_length = False idx = 0 @@ -33,7 +30,7 @@ final_feats = {} word_feats = {} for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) assert idx == int(fields[0]) diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index 903e720bdf4..d644d77911e 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", epilog="E.g. 
" + sys.argv[0] + " data/rnnlm/data", @@ -40,7 +37,7 @@ def check_text_file(text_file): - with open(text_file, 'r', encoding="latin-1") as f: + with open(text_file, 'r', encoding="utf-8") as f: found_nonempty_line = False lineno = 0 if args.allow_internal_eos == 'true': @@ -54,7 +51,7 @@ def check_text_file(text_file): lineno += 1 if args.spot_check == 'true' and lineno > 10: break - words = re.split(tab_or_space, line) + words = line.split() if len(words) != 0: found_nonempty_line = True for word in words: @@ -76,9 +73,9 @@ def check_text_file(text_file): # with some kind of utterance-id first_field_set = set() other_fields_set = set() - with open(text_file, 'r', encoding="latin-1") as f: + with open(text_file, 'r', encoding="utf-8") as f: for line in f: - array = re.split(tab_or_space, line) + array = line.split() if len(array) > 0: first_word = array[0] if first_word in first_field_set or first_word in other_fields_set: diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index 205b934ae1b..3dc9b23aa41 100755 --- a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -7,9 +7,6 @@ import argparse import sys -import re -tab_or_space = re.compile('[ \t]+') - parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " "exp/rnnlm/word_feats.txt", @@ -28,9 +25,9 @@ unigram_feat_id = -1 length_feat_id = -1 max_feat_id = -1 -with open(args.features_file, 'r', encoding="latin-1") as f: +with open(args.features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) @@ -52,9 +49,9 @@ if feat_id > max_feat_id: max_feat_id = feat_id -with open(args.word_features_file, 'r', encoding="latin-1") as f: +with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = re.split(tab_or_space, line) + fields = line.split() assert len(fields) > 0 and len(fields) % 2 == 1 word_id = int(fields[0]) From 185da3aa1afd4b5dda886607a504b83394e8a13f Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:17:53 +0530 Subject: [PATCH 09/49] Update clean_txt_dir.sh --- .../s5_gigaword/local/clean_txt_dir.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh index 56891328a89..0f06c037080 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh @@ -6,7 +6,7 @@ stage=0 nj=500 -. ./path.sh +. ./path_venv.sh . ./cmd.sh . 
./utils/parse_options.sh @@ -38,7 +38,7 @@ if [ $stage -le 0 ]; then $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ local/run_norm.sh \ sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ $outdir/data \ JOB \ $outdir/sparrowhawk/ From cb393c81f678b704aa14de2b0d304ce4191a1026 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:22:12 +0530 Subject: [PATCH 10/49] Update clean_txt_dir.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh index 0f06c037080..60269c0ab7e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh @@ -6,7 +6,7 @@ stage=0 nj=500 -. ./path_venv.sh +. ./path.sh . ./cmd.sh . ./utils/parse_options.sh From 18a9cb6fe0927fbda13311e0bb4399c3e495e9e2 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:23:25 +0530 Subject: [PATCH 11/49] Update train_pocolm.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index c8adb79383e..964dd3bbcc5 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -4,7 +4,7 @@ stage=-2 num_words_pocolm=110000 prune_size=1000000 -. ./path_venv.sh +. ./path.sh . ./cmd.sh . ./utils/parse_options.sh From b023638357122da580ea41a8230b4e7ee2b5c69f Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:23:55 +0530 Subject: [PATCH 12/49] Update pocolm_cust.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh index c6642f6fcf4..422db15937a 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh @@ -5,7 +5,7 @@ set -euo pipefail -. ./path_venv.sh +. 
./path.sh export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) export PATH=$PATH:$POCOLM_ROOT/scripts From 46550f0c598d50df636e5c181611566a7b211085 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 28 Feb 2019 13:04:51 +0000 Subject: [PATCH 13/49] Cosmetic fixes --- .../s5_gigaword/local/clean_abbrevs_text.py | 7 ++++--- .../s5_gigaword/local/get_unigram_weights_vocab.py | 2 +- egs/fisher_callhome_spanish/s5_gigaword/path.sh | 2 +- .../s5_gigaword/path_venv.sh | 13 ------------- 4 files changed, 6 insertions(+), 18 deletions(-) delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py index 22fc54f18cc..e5dfcd07a1c 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # 2018 Saikiran Valluri, GoVivace inc., @@ -13,7 +13,7 @@ sys.exit() inputfile=codecs.open(sys.argv[1], encoding='utf-8') -outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w+') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') for line in inputfile: words = line.split() @@ -26,7 +26,8 @@ textout = textout + " ".join(word) + " " else: textout = textout + word + " " - wordcnt = wordcnt + 1 + if word.isalpha(): + wordcnt = wordcnt + 1 outputfile.write(textout.strip()+ '\n') inputfile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py index 43cf8392167..3ecd16772d7 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # 2018 Saikiran Valluri, GoVivace inc. diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index 2fc3de37406..80edbbaf69a 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -10,4 +10,4 @@ export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C.UTF-8 export LANG=C.UTF-8 - +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh b/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh deleted file mode 100755 index 80edbbaf69a..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/path_venv.sh +++ /dev/null @@ -1,13 +0,0 @@ -export KALDI_ROOT=`pwd`/../../../../kaldi -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH -[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 -. 
$KALDI_ROOT/tools/config/common_path.sh -export LD_LIBRARY_PATH=/home/dpovey/libs - -export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk -export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C.UTF-8 -export LANG=C.UTF-8 - -source ~/anaconda/bin/activate py36 From ce3c7d7a2169113fb6bb7fd0b395250f4f123c12 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Thu, 28 Feb 2019 18:41:36 +0530 Subject: [PATCH 14/49] Update path.sh --- egs/fisher_callhome_spanish/s5_gigaword/path.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index 80edbbaf69a..d2c2937d81e 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -9,5 +9,3 @@ export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C.UTF-8 export LANG=C.UTF-8 - -source ~/anaconda/bin/activate py36 From deeaaa76ce6a89fd500a917f0793eaab93d63356 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Fri, 1 Mar 2019 07:22:40 -0500 Subject: [PATCH 15/49] Bug fix in text normalisation script for gigaword corpus --- .../s5_gigaword/local/clean_abbrevs_text.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py index e5dfcd07a1c..a6edc0f92c5 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -20,14 +20,14 @@ textout = "" wordcnt = 0 for word in words: - if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word) and wordcnt>0: - print(word) - word = re.sub('\'?s', 's', word) - textout = textout + " ".join(word) + " " + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): + if wordcnt > 0: + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " else: - textout = textout + word + " " - if word.isalpha(): - wordcnt = wordcnt + 1 + if word.isalpha(): wordcnt = wordcnt + 1 outputfile.write(textout.strip()+ '\n') inputfile.close() From 633f21d33a53228ca870821ce6a2e5a432c4e9f6 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 1 Mar 2019 20:26:37 +0530 Subject: [PATCH 16/49] small Fix path.sh --- egs/fisher_callhome_spanish/s5_gigaword/path.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index d2c2937d81e..e622e7d5051 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -1,4 +1,4 @@ -export KALDI_ROOT=`pwd`/../../../../kaldi +export KALDI_ROOT=`pwd`/../../../ [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" 
&& exit 1 From 8d6b14d1f75c9f532ab945e1328c8d925cf21064 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 1 Mar 2019 21:17:29 +0530 Subject: [PATCH 17/49] Update clean_abbrevs_text.py --- .../s5_gigaword/local/clean_abbrevs_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py index a6edc0f92c5..7d92eb9fe3a 100644 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py @@ -27,6 +27,7 @@ else: textout = textout + word + " " else: + textout = textout + word + " " if word.isalpha(): wordcnt = wordcnt + 1 outputfile.write(textout.strip()+ '\n') From 8c9c37bad8eba62d20231dda0d34553a6ce12c1b Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Fri, 1 Mar 2019 15:54:00 +0000 Subject: [PATCH 18/49] Added sparrowhawk installation script for text normalisation --- tools/install_sparrowhawk.sh | 73 ++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 tools/install_sparrowhawk.sh diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh new file mode 100755 index 00000000000..f9bbcb1b28e --- /dev/null +++ b/tools/install_sparrowhawk.sh @@ -0,0 +1,73 @@ +#!/bin/bash +export LDFLAGS="-L`pwd`/openfst/lib" +export CXXFLAGS="-I`pwd`/openfst/include" +stage=0 + +if [ $stage -le 0 ] ; then + git clone -b feature/Spanish_normalizer https://github.com/spokencloud/sparrowhawk-resources.git || exit 1; + patch -p0 < sparrowhawk-resources/local/Makefile.patch || exit 1; + make openfst || exit 1; + git clone https://github.com/mjansche/thrax.git + export LDFLAGS=-L`pwd`/openfst/lib + export CXXFLAGS=-I`pwd`/openfst/include + cd thrax + autoreconf --force --install || exit 1; + ./configure --prefix=`pwd` || exit 1; + make || exit 1; + make install || exit 1; + cd .. + git clone https://github.com/google/re2.git || exit 1; + cd re2/ + make -j 20 || exit 1; + make test || exit 1; + make install prefix=`pwd` || exit 1; + cd .. + git clone https://github.com/google/protobuf.git || exit 1; + cd protobuf/ + ./autogen.sh || exit 1; + ./configure --prefix=`pwd` || exit 1; + make -j 20 || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 1 ]; then + git clone https://github.com/google/sparrowhawk.git || exit 1; + patch -p0 < sparrowhawk-resources/local/sparrowhawk.patch || exit 1; + cd sparrowhawk/ || exit 1; + mkdir lib + mkdir bin + mkdir include + cp -r ../openfst/lib/* lib/ || exit 1; + cp -r ../protobuf/lib/* lib/ || exit 1; + cp -r ../re2/lib/* lib/ || exit 1; + cp -r ../thrax/lib/* lib/ || exit 1; + cp -r ../openfst/include/* include/ || exit 1; + cp -r ../protobuf/include/* include/ || exit 1; + cp -r ../re2/include/* include/ || exit 1; + cp -r ../thrax/include/* include/ || exit 1; + cp ../protobuf/bin/protoc bin/. || exit 1; + export PATH=`pwd`/bin:$PATH + aclocal || exit 1; + automake || exit 1; + ./configure --prefix=`pwd` CPPFLAGS="-I`pwd`/include" LDFLAGS="-L`pwd`/lib" || exit 1; + make || exit 1; + make install || exit 1; + cd .. +fi + +if [ $stage -le 2 ]; then + source ~/anaconda/bin/activate py27 || exit 1; + cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; + cd sparrowhawk/language-resources/en/textnorm/classifier || exit 1; + . 
./path.sh || exit 1; + python create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far + thraxmakedep tokenize_and_classify.grm || exit 1; + make || exit 1; + cd ../verbalizer + python create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far + cp -r ../classifier/universal_depot.far . + thraxmakedep verbalize.grm || exit 1; + make || exit 1; + cd ../../../../.. +fi From c6b05d18597612170148a3ad7b313dc192d62de4 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sat, 2 Mar 2019 06:02:57 +0000 Subject: [PATCH 19/49] G2P training stage added into Spanish gigaword recipe --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 8 ++++++++ tools/extras/install_g2p_seq2seq.sh | 5 +++++ tools/install_g2p_seq2seq.sh | 1 + 3 files changed, 14 insertions(+) create mode 100644 tools/extras/install_g2p_seq2seq.sh create mode 120000 tools/install_g2p_seq2seq.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index bd553fc720e..7e488cdc5fa 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -39,6 +39,9 @@ if [ $stage -le -1 ]; then # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -74,6 +77,11 @@ if [ $stage -le -1 ]; then local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi fi if [ $stage -le 0 ]; then diff --git a/tools/extras/install_g2p_seq2seq.sh b/tools/extras/install_g2p_seq2seq.sh new file mode 100644 index 00000000000..c9979b8b961 --- /dev/null +++ b/tools/extras/install_g2p_seq2seq.sh @@ -0,0 +1,5 @@ +if [ ! -e g2p-seq2seq ];then + git clone https://github.com/cmusphinx/g2p-seq2seq.git + cd g2p-seq2seq/ + python setup.py install +fi diff --git a/tools/install_g2p_seq2seq.sh b/tools/install_g2p_seq2seq.sh new file mode 120000 index 00000000000..77715305f74 --- /dev/null +++ b/tools/install_g2p_seq2seq.sh @@ -0,0 +1 @@ +extras/install_g2p_seq2seq.sh \ No newline at end of file From 8c226cc9b0995c9a656a20484587d46ed28e5fee Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sat, 2 Mar 2019 06:06:28 +0000 Subject: [PATCH 20/49] G2P seq2seq scripts added in steps/ --- egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh | 42 ++++++++++++++++++++++ egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh | 39 ++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh create mode 100644 egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh diff --git a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh new file mode 100644 index 00000000000..77a08c305dd --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script applies a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. 
utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line, with lines like 'hello h uh l ow') and" + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 +outdir=$3 + +mkdir -p $outdir + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ] ; then + echo "g2p-seq2seq was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --decode $lexicon --model_dir $wdir --output $outdir/lexicon.lex + diff --git a/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh new file mode 100644 index 00000000000..e0389171fd5 --- /dev/null +++ b/egs/wsj/s5/steps/dict/train_g2p_seq2seq.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Copyright 2018 Govivace Inc. (Author: Valluri Saikiran) +# Apache License 2.0 + +# This script trains a g2p model using CMUsphinx/seq2seq. + +stage=0 +encoding='utf-8' + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line, with lines like 'hello h uh l ow') and" + echo " is directory where the models will be stored" + exit 1; +fi + +lexicon=$1 +wdir=$2 + +[ ! -f $lexicon ] && echo "Cannot find $lexicon" && exit + +if [ ! -s `which g2p-seq2seq` ]; then + echo "g2p-seq2seq was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_g2p_seq2seq.sh" + exit 1 +fi + +g2p-seq2seq --max_epochs 12 --train $lexicon --model_dir $wdir + From 7b67fc2ade32fa7449a3a228903c920f499a2c3c Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sat, 2 Mar 2019 12:09:40 +0000 Subject: [PATCH 21/49] RNNLM scripts updated to UTF8 encoding --- scripts/rnnlm/choose_features.py | 12 +++++++++--- scripts/rnnlm/get_best_model.py | 24 +++++++++++++----------- scripts/rnnlm/get_embedding_dim.py | 2 +- scripts/rnnlm/get_num_splits.sh | 2 +- scripts/rnnlm/get_special_symbol_opts.py | 8 ++++++-- scripts/rnnlm/get_unigram_probs.py | 18 +++++++++++------- scripts/rnnlm/get_vocab.py | 7 +++++-- scripts/rnnlm/get_word_features.py | 15 +++++++++------ scripts/rnnlm/lmrescore.sh | 6 ++++++ scripts/rnnlm/lmrescore_nbest.sh | 4 ++-- scripts/rnnlm/lmrescore_pruned.sh | 17 +++++++++++++---- scripts/rnnlm/prepare_rnnlm_dir.sh | 9 +++++++-- scripts/rnnlm/prepare_split_data.py | 5 ++++- scripts/rnnlm/show_word_features.py | 13 +++++++++++-- scripts/rnnlm/train_rnnlm.sh | 2 +- scripts/rnnlm/validate_features.py | 5 ++++- scripts/rnnlm/validate_text_dir.py | 7 +++++-- scripts/rnnlm/validate_word_features.py | 7 +++++-- 18 files changed, 113 insertions(+), 50 deletions(-) diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index 799f6b6dcc8..c6621e04494 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -10,6 +10,12 @@ from collections import defaultdict sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +# because this script splits inside words, we cannot use latin-1; we actually need to know what +# what the encoding is. By default we make this utf-8; to handle encodings that are not compatible +# with utf-8 (e.g. gbk), we'll eventually have to make the encoding an option to this script. 
+ +import re +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " @@ -84,9 +90,9 @@ # and 'wordlist' is a list indexed by integer id, that returns the string-valued word. def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -115,7 +121,7 @@ def read_unigram_probs(unigram_probs_file): unigram_probs = [] with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py index 45487b18b0c..ed266346e06 100755 --- a/scripts/rnnlm/get_best_model.py +++ b/scripts/rnnlm/get_best_model.py @@ -3,14 +3,14 @@ # Copyright 2017 Johns Hopkins University (author: Daniel Povey) # License: Apache 2.0. -import os import argparse -import sys +import glob import re +import sys parser = argparse.ArgumentParser(description="Works out the best iteration of RNNLM training " - "based on dev-set perplexity, and prints the number corresponding " - "to that iteration", + "based on dev-set perplexity, and prints the number corresponding " + "to that iteration", epilog="E.g. " + sys.argv[0] + " exp/rnnlm_a", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -19,8 +19,7 @@ args = parser.parse_args() - -num_iters=None +num_iters = None try: with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f: for line in f: @@ -36,15 +35,15 @@ sys.exit(sys.argv[0] + ": could not get num_iters from {0}/info.txt".format( args.rnnlm_dir)) -best_objf=-2000 -best_iter=-1 -for i in range(num_iters): +best_objf = -2000 +best_iter = -1 +for i in range(1, num_iters): this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i) try: f = open(this_logfile, 'r', encoding='utf-8') except: sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile)) - this_objf=-1000 + this_objf = -1000 for line in f: m = re.search('Overall objf .* (\S+)$', str(line)) if m is not None: @@ -53,6 +52,10 @@ except Exception as e: sys.exit(sys.argv[0] + ": line in file {0} could not be parsed: {1}, error is: {2}".format( this_logfile, line, str(e))) + # verify this iteration still has model files present + if len(glob.glob("{0}/{1}.raw".format(args.rnnlm_dir, i))) == 0: + # this iteration has log files, but model files have been cleaned up, skip it + continue if this_objf == -1000: print(sys.argv[0] + ": warning: could not parse objective function from {0}".format( this_logfile), file=sys.stderr) @@ -63,5 +66,4 @@ if best_iter == -1: sys.exit(sys.argv[0] + ": error: could not get best iteration.") - print(str(best_iter)) diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py index b6810ef2cbf..1d516e0edf5 100755 --- a/scripts/rnnlm/get_embedding_dim.py +++ b/scripts/rnnlm/get_embedding_dim.py @@ -101,4 +101,4 @@ "nnet '{0}': {1} != {2}".format( args.nnet, input_dim, output_dim)) -print(str(input_dim)) +print('{}'.format(input_dim)) diff --git a/scripts/rnnlm/get_num_splits.sh b/scripts/rnnlm/get_num_splits.sh index 
93d1f7f169c..974fd8bf204 100755 --- a/scripts/rnnlm/get_num_splits.sh +++ b/scripts/rnnlm/get_num_splits.sh @@ -65,7 +65,7 @@ tot_with_multiplicities=0 for f in $text/*.counts; do if [ "$f" != "$text/dev.counts" ]; then - this_tot=$(cat $f | awk '{tot += $2} END{print tot}') + this_tot=$(cat $f | awk '{tot += $2} END{printf("%d", tot)}') if ! [ $this_tot -gt 0 ]; then echo "$0: there were no counts in counts file $f" 1>&2 exit 1 diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 13fe497faf9..0cf8e10feca 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -8,6 +8,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " "print out the options with correct value to stdout, which may look like " @@ -25,9 +28,10 @@ lower_ids = {} upper_ids = {} -input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace') +input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') for line in input_stream: - fields = line.split() + fields = re.split(tab_or_space, line) + assert(len(fields) == 2) sym = fields[0] if sym in special_symbols: assert sym not in lower_ids diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index 32b01728ca3..d115b6f54bf 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. " + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " "--data-weights-file=exp/rnnlm/data_weights.txt data/rnnlm/data " @@ -74,10 +77,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="utf-8", errors='replace') as f: + with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -99,9 +102,9 @@ def read_data_weights(weights_file, data_sources): # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -128,10 +131,11 @@ def get_counts(data_sources, data_weights, vocab): if weight == 0.0: continue - with open(counts_file, 'r', encoding="utf-8", errors='replace') as f: + with open(counts_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() - assert len(fields) == 2 + fields = re.split(tab_or_space, line) + if len(fields) != 2: print("Warning, should be 2 cols:", fields, line, file=sys.stderr); + assert(len(fields) == 2) word = fields[0] count = fields[1] if word not in vocab: diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index f290ef721c1..d65f8e3669b 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -8,6 +8,9 @@ import sys sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", epilog="E.g. " + sys.argv[0] + " data/rnnlm/data > data/rnnlm/vocab/words.txt", @@ -27,8 +30,8 @@ def add_counts(word_counts, counts_file): with open(counts_file, 'r', encoding="utf-8") as f: for line in f: - line = line.strip() - word_and_count = line.split() + line = line.strip(" \t\r\n") + word_and_count = re.split(tab_or_space, line) assert len(word_and_count) == 2 if word_and_count[0] in word_counts: word_counts[word_and_count[0]] += int(word_and_count[1]) diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index 8bdb553b9c8..7555b774b83 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -9,6 +9,9 @@ import math from collections import defaultdict +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " --unigram-probs=exp/rnnlm/unigram_probs.txt " @@ -38,9 +41,9 @@ # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8", errors='replace') as f: + with open(vocab_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -59,9 +62,9 @@ def read_vocab(vocab_file): # return a list of unigram_probs, indexed by word id def read_unigram_probs(unigram_probs_file): unigram_probs = [] - with open(unigram_probs_file, 'r', encoding="utf-8", errors='replace') as f: + with open(unigram_probs_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): @@ -100,9 +103,9 @@ def read_features(features_file): feats['min_ngram_order'] = 10000 feats['max_ngram_order'] = -1 - with open(features_file, 'r', encoding="utf-8", errors='replace') as f: + with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) diff --git a/scripts/rnnlm/lmrescore.sh b/scripts/rnnlm/lmrescore.sh index cd0cf793d8d..9da22ae75a2 100755 --- a/scripts/rnnlm/lmrescore.sh +++ b/scripts/rnnlm/lmrescore.sh @@ -72,6 +72,12 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; +if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then + # the last word of the RNNLM word list is an added word + echo "$0: Word lists mismatch for lattices and RNNLM." + exit 1 +fi + oldlm_command="fstproject --project_output=true $oldlm |" special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt) diff --git a/scripts/rnnlm/lmrescore_nbest.sh b/scripts/rnnlm/lmrescore_nbest.sh index f50a3c909f0..58b19b9fa79 100755 --- a/scripts/rnnlm/lmrescore_nbest.sh +++ b/scripts/rnnlm/lmrescore_nbest.sh @@ -29,7 +29,7 @@ if [ $# != 6 ]; then echo "This version applies an RNNLM and mixes it with the LM scores" echo "previously in the lattices., controlled by the first parameter (rnnlm-weight)" echo "" - echo "Usage: utils/rnnlmrescore.sh " + echo "Usage: $0 [options] " echo "Main options:" echo " --inv-acwt # default 12. e.g. --inv-acwt 17. Equivalent to LM scale to use." echo " # for N-best list generation... note, we'll score at different acwt's" @@ -177,7 +177,7 @@ fi if [ $stage -le 6 ]; then echo "$0: invoking rnnlm/compute_sentence_scores.sh which calls rnnlm to get RNN LM scores." $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ - local/rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ + rnnlm/compute_sentence_scores.sh $rnndir $adir.JOB/temp \ $adir.JOB/words_text $adir.JOB/lmwt.rnn fi if [ $stage -le 7 ]; then diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 46ee5846424..9ba78415708 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -16,16 +16,18 @@ max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram- # the same ngram history and this prevents the lattice from # exploding exponentially. Details of the n-gram approximation # method are described in section 2.3 of the paper - # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdm -max_arcs=499 # limit the max arcs in lattice while rescoring. 
E.g., 20000 + # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf +max_arcs= # limit the max arcs in lattice while rescoring. E.g., 20000 -acwt=1 -weight=1 # Interpolation weight for RNNLM. +acwt=0.1 +weight=0.5 # Interpolation weight for RNNLM. normalize=false # If true, we add a normalization step to the output of the RNNLM # so that it adds up to *exactly* 1. Note that this is not necessary # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf +lattice_prune_beam=4 # Beam used in pruned lattice composition + # This option affects speed and how large the composed lattice may be # End configuration section. @@ -73,6 +75,12 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; +if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then + # the last word of the RNNLM word list is an added word + echo "$0: Word lists mismatch for lattices and RNNLM." + exit 1 +fi + normalize_opt= if $normalize; then normalize_opt="--normalize-probs=true" @@ -97,6 +105,7 @@ cp $indir/num_jobs $outdir $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore-kaldi-rnnlm-pruned --lm-scale=$weight $special_symbol_opts \ + --lattice-compose-beam=$lattice_prune_beam \ --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order $normalize_opt $max_arcs_opt \ $carpa_option $oldlm $word_embedding "$rnnlm_dir/final.raw" \ "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; diff --git a/scripts/rnnlm/prepare_rnnlm_dir.sh b/scripts/rnnlm/prepare_rnnlm_dir.sh index d3ee44f1f95..e101822d983 100755 --- a/scripts/rnnlm/prepare_rnnlm_dir.sh +++ b/scripts/rnnlm/prepare_rnnlm_dir.sh @@ -23,7 +23,7 @@ if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "Sets up the directory for RNNLM training as done by" echo "rnnlm/train_rnnlm.sh, and initializes the model." - echo " is as validated by rnnlm/validate_data_dir.py" + echo " is as validated by rnnlm/validate_text_dir.py" echo " is as validated by rnnlm/validate_config_dir.sh." exit 1 fi @@ -34,6 +34,7 @@ config_dir=$2 dir=$3 set -e +. ./path.sh if [ $stage -le 0 ]; then echo "$0: validating input" @@ -52,9 +53,13 @@ if [ $stage -le 1 ]; then echo "$0: copying config directory" mkdir -p $dir/config # copy expected things from $config_dir to $dir/config. - for f in words.txt features.txt data_weights.txt oov.txt xconfig; do + for f in words.txt data_weights.txt oov.txt xconfig; do cp $config_dir/$f $dir/config done + # features.txt is optional, check separately + if [ -f $config_dir/features.txt ]; then + cp $config_dir/features.txt $dir/config + fi fi rnnlm/get_special_symbol_opts.py < $dir/config/words.txt > $dir/special_symbol_opts.txt diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index 9cc4f69d09f..adcb164771d 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -8,6 +8,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " @@ -66,7 +69,7 @@ def read_data_weights(weights_file, data_sources): with open(weights_file, 'r', encoding="utf-8") as f: for line in f: try: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 89d84d53f3e..8b69fbb7d8a 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -6,8 +6,17 @@ import os import argparse import sys + +# The use of latin-1 encoding does not preclude reading utf-8. latin-1 encoding +# means "treat words as sequences of bytes", and it is compatible with utf-8 +# encoding as well as other encodings such as gbk, as long as the spaces are +# also spaces in ascii (which we check). It is basically how we emulate the +# behavior of python before python3. sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. " + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " "> exp/rnnlm/word_feats.str.txt", @@ -29,7 +38,7 @@ def read_feature_type_and_key(features_file): with open(features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [2, 3, 4]) feat_id = int(fields[0]) @@ -46,7 +55,7 @@ def read_feature_type_and_key(features_file): num_word_feats = 0 with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) % 2 == 1 print(int(fields[0]), end='\t') diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh index f056d096120..013e9a56c2f 100755 --- a/scripts/rnnlm/train_rnnlm.sh +++ b/scripts/rnnlm/train_rnnlm.sh @@ -41,7 +41,7 @@ use_gpu_for_diagnostics=false # set true to use GPU for compute_prob_*.log # optional cleanup options cleanup=false # add option --cleanup true to enable automatic cleanup of old models cleanup_strategy="keep_latest" # determines cleanup strategy, use either "keep_latest" or "keep_best" -cleanup_keep_iters=100 # number of iterations that will have their models retained +cleanup_keep_iters=3 # number of iterations that will have their models retained trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM . utils/parse_options.sh diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index a650092b086..939e634592c 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. 
" + sys.argv[0] + " exp/rnnlm/features.txt", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -30,7 +33,7 @@ final_feats = {} word_feats = {} for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) assert idx == int(fields[0]) diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index d644d77911e..61914e4836a 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", epilog="E.g. " + sys.argv[0] + " data/rnnlm/data", @@ -51,7 +54,7 @@ def check_text_file(text_file): lineno += 1 if args.spot_check == 'true' and lineno > 10: break - words = line.split() + words = re.split(tab_or_space, line) if len(words) != 0: found_nonempty_line = True for word in words: @@ -75,7 +78,7 @@ def check_text_file(text_file): other_fields_set = set() with open(text_file, 'r', encoding="utf-8") as f: for line in f: - array = line.split() + array = re.split(tab_or_space, line) if len(array) > 0: first_word = array[0] if first_word in first_field_set or first_word in other_fields_set: diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index 3dc9b23aa41..303daf28bb1 100755 --- a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]+') + parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. 
" + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " "exp/rnnlm/word_feats.txt", @@ -27,7 +30,7 @@ max_feat_id = -1 with open(args.features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) @@ -51,7 +54,7 @@ with open(args.word_features_file, 'r', encoding="utf-8") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) > 0 and len(fields) % 2 == 1 word_id = int(fields[0]) From 4767c7ce0aef8db9d2e4bdd708773fc84ef1cf0b Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 8 Mar 2019 22:03:50 +0530 Subject: [PATCH 22/49] Update pocolm_cust.sh --- egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh index 422db15937a..0e71be29119 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh @@ -13,7 +13,7 @@ export PATH=$PATH:$POCOLM_ROOT/scripts wordlist=None num_word=100000 -pocolm_stage=2 +pocolm_stage=1 ngram_order=3 lm_dir= arpa_dir= From 2cd5948302c2f4c787a28d7fc96b700af8f525c3 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 8 Mar 2019 22:04:58 +0530 Subject: [PATCH 23/49] Update run.sh --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 7e488cdc5fa..b63b5208138 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,6 +6,7 @@ stage=-1 lmstage=-2 +num_words_pocolm=110000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is @@ -96,7 +97,6 @@ if [ $stage -le 0 ]; then cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi -num_words_pocolm=110000 if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ From 6595b429f3e743f779f8ef7f3e9f605bb6bd8105 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 18 Mar 2019 14:56:40 +0000 Subject: [PATCH 24/49] Added steps for generating POCOLM ARPA file --- .../s5_gigaword/local/train_pocolm.sh | 7 +++++-- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh index 964dd3bbcc5..b8b3ca35ef9 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh @@ -43,8 +43,11 @@ if [ $stage -le -1 ];then python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - - + prune_lm_dir.py --target-num-ngrams=$prune_size 
"$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ + "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" + mkdir -p "$pocolm_dir"/arpa + format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ + gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index b63b5208138..1ad8f9f1e0b 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,6 +6,7 @@ stage=-1 lmstage=-2 +addtraintext=true num_words_pocolm=110000 train_sgmm2=false @@ -95,6 +96,9 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + if $addtraintext; then + cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + fi fi if [ $stage -le 1 ]; then From 0902c9e02c139cbf41d6d5c944957ee46a1bca6d Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sun, 24 Mar 2019 10:52:19 +0530 Subject: [PATCH 25/49] Update run.sh --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 1ad8f9f1e0b..970a058a07f 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -32,9 +32,9 @@ if [ -f path.sh ]; then . ./path.sh; fi set -eou pipefail if [ $stage -le -1 ]; then -# local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts -# local/callhome_data_prep.sh $callhome_speech $callhome_transcripts + local/callhome_data_prep.sh $callhome_speech $callhome_transcripts # The lexicon is created using the LDC spanish lexicon, the words from the # fisher spanish corpus. Additional (most frequent) words are added from the From c10b0fe6d3a8be4e75fb31477acca179265c2ca4 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sun, 24 Mar 2019 06:44:49 +0000 Subject: [PATCH 26/49] Apply g2p part added to get extended lexicon --- .../s5_gigaword/local/get_rnnlm_wordlist.py | 16 ++--- .../s5_gigaword/run.sh | 60 +++++++++++-------- egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh | 7 +-- 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py index d6ddfbecc14..fc13a7af701 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py @@ -1,17 +1,18 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # 2018 Saikiran Valluri, GoVivace inc. 
import os, sys -if len(sys.argv) < 4: - print( "Usage: python get_rnnlm_wordlist.py ") +if len(sys.argv) < 5: + print( "Usage: python get_rnnlm_wordlist.py ") sys.exit() -lexicon_words = open(sys.argv[1], 'r') -pocolm_words = open(sys.argv[2], 'r') -rnnlm_wordsout = open(sys.argv[3], 'w') +lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") +pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") +rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") +oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") line_count=0 lexicon=[] @@ -23,10 +24,11 @@ for line in pocolm_words: if not line.split()[0] in lexicon: + oov_wordlist.write(line.split()[0]+'\n') rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') line_count = line_count + 1 lexicon_words.close() pocolm_words.close() rnnlm_wordsout.close() - +oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 1ad8f9f1e0b..4abd34096ef 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,6 +6,7 @@ stage=-1 lmstage=-2 +train_rnnlm=true addtraintext=true num_words_pocolm=110000 train_sgmm2=false @@ -32,31 +33,23 @@ if [ -f path.sh ]; then . ./path.sh; fi set -eou pipefail if [ $stage -le -1 ]; then -# local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts + local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts -# local/callhome_data_prep.sh $callhome_speech $callhome_transcripts + local/callhome_data_prep.sh $callhome_speech $callhome_transcripts # The lexicon is created using the LDC spanish lexicon, the words from the # fisher spanish corpus. Additional (most frequent) words are added from the # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + # Let's keep the original dict copy for G2P training + cp -r data/local/dict data/local/dict_orig ( - steps/dict/train_g2p_seq2seq.sh data/local/dict/lexicon.txt exp/g2p || touch exp/g2p/.error + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error ) & # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh + utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig utils/fix_data_dir.sh data/local/data/train_all @@ -79,11 +72,7 @@ if [ $stage -le -1 ]; then local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome - wait # wait till G2P training finishes - if [ -f exp/g2p/.error ]; then - rm exp/g2p/.error || true - echo "Fail to train the G2P model." 
&& exit 1; - fi + fi if [ $stage -le 0 ]; then @@ -103,16 +92,37 @@ fi if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - local/get_rnnlm_wordlist.py data/lang/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ - "$rnnlm_workdir"/rnnlm_wordlist -fi - -if [ $stage -le 2 ]; then - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + local/get_rnnlm_wordlist.py data/local/dict/lexicon.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords + if $train_rnnlm; then + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm + fi fi + if [ $stage -le 2 ]; then + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex + cat "$rnnlm_workdir"/oov_g2p.lex data/local/dict/lexicon.txt | sort -u > "$rnnlm_workdir"/lexicon_extended.txt + cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. Depending on your needs, you might have to change the size of + # the splits within that file. 
The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir diff --git a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh index 77a08c305dd..e6e316ec6b1 100644 --- a/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh +++ b/egs/wsj/s5/steps/dict/apply_g2p_seq2seq.sh @@ -17,10 +17,9 @@ set -u set -e if [ $# != 3 ]; then - echo "Usage: $0 [options] " - echo " where is the training lexicon (one pronunciation per " - echo " word per line, with lines like 'hello h uh l ow') and" - echo " is directory where the models will be stored" + echo "Usage: $0 [options] " + echo " where is the OOV wordlist " + echo " is directory where the models will be stored" exit 1; fi From 3df45aec1d8f8a031eb8665c5c94e6be27e81803 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Sun, 24 Mar 2019 07:49:08 +0000 Subject: [PATCH 27/49] Small fix in run.sh rnnlm_wordlist --- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 4abd34096ef..9d332cf06de 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -92,7 +92,7 @@ fi if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - local/get_rnnlm_wordlist.py data/local/dict/lexicon.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords if $train_rnnlm; then local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ From 7e47695e793c113c385398dafb32f92572aec6f7 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 25 Mar 2019 06:28:24 +0000 Subject: [PATCH 28/49] Added sanity chack for Sparrowhawk normalizer in cleanup script --- .../s5_gigaword/local/clean_txt_dir.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh index 60269c0ab7e..1880b3a90cb 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh @@ -17,6 +17,12 @@ if [ $# -ne 2 ]; then exit 1; fi +if [ ! -s `which normalizer_main` ] ; then + echo "Sparrowhawk normalizer was not found installed !" + echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" 
+ exit 1 +fi + txtdir=$1 textdir=$(realpath $txtdir) outdir=$(realpath $2) @@ -38,7 +44,7 @@ if [ $stage -le 0 ]; then $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ local/run_norm.sh \ sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ + $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ $outdir/data \ JOB \ $outdir/sparrowhawk/ From 91a4611bba540c907b223c39b658bc5baca3a80f Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 25 Mar 2019 07:10:49 +0000 Subject: [PATCH 29/49] Data preparation fixes --- .../s5_gigaword/local/chain/run_tdnn_1g.sh | 7 ++++++- .../s5_gigaword/local/fsp_data_prep.sh | 1 + egs/fisher_callhome_spanish/s5_gigaword/run.sh | 8 +++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh index c487f1bd222..08e378cf8c5 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh @@ -27,9 +27,10 @@ nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. common_egs_dir= reporting_email= +gigaword_workdir= # LSTM/chain options -train_stage=-10 +train_stage=-20 xent_regularize=0.1 dropout_schedule='0,0@0.20,0.3@0.50,0' @@ -277,6 +278,10 @@ if [ $stage -le 23 ]; then --online-ivector-dir exp/nnet3/ivectors_${data}_hires \ $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done + if [ $gigaword_workdir ]; then + bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; + fi bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh index 11d65da3e95..22b98a6c9db 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh @@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then sed 's:::g' | \ sed 's:foreign>::g' | \ + sed 's:\[noise\]:[noise] :g' | \ sed 's:>::g' | \ #How do you handle numbers? grep -v '()' | \ diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 9d332cf06de..687fcfdf3c1 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -23,7 +23,7 @@ callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=/export/c03/svalluri/workdir_pocolm_2stage +rnnlm_workdir=workdir_rnnlm_Spanish_08032019 mfccdir=`pwd`/mfcc . 
./cmd.sh @@ -75,6 +75,7 @@ if [ $stage -le -1 ]; then fi + if [ $stage -le 0 ]; then mkdir -p "$rnnlm_workdir"/gigaword_rawtext local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 @@ -90,6 +91,7 @@ if [ $stage -le 0 ]; then fi fi + if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ @@ -108,7 +110,7 @@ if [ $stage -le 2 ]; then echo "Fail to train the G2P model." && exit 1; fi steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex data/local/dict/lexicon.txt | sort -u > "$rnnlm_workdir"/lexicon_extended.txt + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sort | uniq | sed "/^$/d" > "$rnnlm_workdir"/lexicon_extended.txt cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -294,6 +296,6 @@ fi wait; if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh || exit 1; + local/chain/run_tdnn_1g.sh --gigaword-workdir $rnnlm_workdir || exit 1; fi exit 0; From 5f45dd17453dc3eb2424b35d78e1ed3eb20a5a2c Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 26 Mar 2019 08:02:39 -0400 Subject: [PATCH 30/49] Cosmetic options for gigaword textclean --- .../s5_gigaword/path.sh | 6 +++-- .../s5_gigaword/run.sh | 23 +++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh index e622e7d5051..2993311fd90 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/path.sh @@ -7,5 +7,7 @@ export LD_LIBRARY_PATH=/home/dpovey/libs export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C.UTF-8 -export LANG=C.UTF-8 +export LC_ALL=C +export LANG=C + +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index 687fcfdf3c1..e1c43d24902 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -6,7 +6,8 @@ stage=-1 lmstage=-2 -train_rnnlm=true +train_rnnlm=false +start_textcleanup=false addtraintext=true num_words_pocolm=110000 train_sgmm2=false @@ -14,7 +15,7 @@ train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). 
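# Annotation (usage sketch, not part of the committed hunk): because run.sh later
# sources parse_options.sh, each variable in this block can be overridden from the
# command line; the paths below are placeholders, not recipe defaults:
#   ./run.sh --stage -1 --train-rnnlm true \
#     --sfisher-speech /my/corpora/LDC2010S01 \
#     --sfisher-transcripts /my/corpora/LDC2010T04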
sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +sfisher_transcripts=/export/c03/svalluri//LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher @@ -44,9 +45,9 @@ if [ $stage -le -1 ]; then local/fsp_prepare_dict.sh $spanish_lexicon # Let's keep the original dict copy for G2P training cp -r data/local/dict data/local/dict_orig - ( - steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error - ) & +# ( +# steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error +# ) & # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig @@ -75,8 +76,12 @@ if [ $stage -le -1 ]; then fi +if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can opt out this stage " + echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." -if [ $stage -le 0 ]; then + if [ $stage -le 0 ]; then mkdir -p "$rnnlm_workdir"/gigaword_rawtext local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt @@ -89,9 +94,9 @@ if [ $stage -le 0 ]; then if $addtraintext; then cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi + fi fi - if [ $stage -le 1 ]; then local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ @@ -110,7 +115,7 @@ if [ $stage -le 2 ]; then echo "Fail to train the G2P model." && exit 1; fi steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sort | uniq | sed "/^$/d" > "$rnnlm_workdir"/lexicon_extended.txt + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -296,6 +301,6 @@ fi wait; if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh --gigaword-workdir $rnnlm_workdir || exit 1; + local/chain/run_tdnn_1g.sh --stage 9 --gigaword-workdir $rnnlm_workdir || exit 1; fi exit 0; From e711d30f7bb77c3c5fa1e766de1896d1559bd3a1 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 1 Apr 2019 07:16:17 -0400 Subject: [PATCH 31/49] Some fixes in rnnlm training --- .../s5_gigaword/local/chain/run_tdnn_1g.sh | 9 +++++---- egs/fisher_callhome_spanish/s5_gigaword/run.sh | 16 ++++++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh index 08e378cf8c5..2f478419a18 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh @@ -202,7 +202,7 @@ fi if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi @@ -255,9 +255,10 @@ if [ $stage -le 21 ]; then fi +# Let's train first a small RNNLM on Fisher train set rnnlmdir=exp/rnnlm_lstm_tdnn_1b if [ $stage -le 22 ]; then - local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; + rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; fi if [ $stage -le 23 ]; then @@ -279,10 +280,10 @@ if [ $stage -le 23 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done if [ $gigaword_workdir ]; then - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; fi - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh index e1c43d24902..95425c29034 100755 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ b/egs/fisher_callhome_spanish/s5_gigaword/run.sh @@ -7,8 +7,12 @@ stage=-1 lmstage=-2 train_rnnlm=false -start_textcleanup=false -addtraintext=true +start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. + # If you already have the normalised gigword text somewhere, you can bypass the + # time consuming text cleanup (~1 week) by setting this option false. +addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to + # perform the A, A + G, Dev type POCOLM training configuration. 
+ # A=fsp train, G=gigword text, num_words_pocolm=110000 train_sgmm2=false @@ -45,9 +49,9 @@ if [ $stage -le -1 ]; then local/fsp_prepare_dict.sh $spanish_lexicon # Let's keep the original dict copy for G2P training cp -r data/local/dict data/local/dict_orig -# ( -# steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error -# ) & + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig @@ -301,6 +305,6 @@ fi wait; if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh --stage 9 --gigaword-workdir $rnnlm_workdir || exit 1; + local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; fi exit 0; From 8d521c694f0809cfb058568123fc8355406d1b78 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Mon, 1 Apr 2019 07:18:06 -0400 Subject: [PATCH 32/49] Moved s5_gigaword directory to s5 --- egs/fisher_callhome_spanish/s5/RESULTS | 38 ------ egs/fisher_callhome_spanish/s5/cmd.sh | 4 +- .../s5/local/chain/run_tdnn_1g.sh | 16 ++- .../s5/local/clean_abbrevs_text.py | 35 +++++ .../s5/local/clean_txt_dir.sh | 57 +++++++++ egs/fisher_callhome_spanish/s5/local/ctm.sh | 6 +- .../flatten_gigaword/flatten_all_gigaword.sh | 15 +++ .../flatten_gigaword/flatten_one_gigaword.py | 61 +++++++++ .../s5/local/flatten_gigaword/run_flat.sh | 17 +++ .../s5/local/fsp_data_prep.sh | 1 + .../s5/local/fsp_prepare_dict.sh | 5 +- .../s5/local/get_data_weights.pl | 39 ++++++ .../s5/local/get_rnnlm_wordlist.py | 34 +++++ .../s5/local/get_unigram_weights_vocab.py | 33 +++++ .../s5/local/merge_lexicons.py | 7 +- .../s5/local/pocolm_cust.sh | 120 +++++++++++++++++ egs/fisher_callhome_spanish/s5/local/rnnlm.sh | 83 ++++++++++++ .../s5/local/rnnlm/train_rnnlm.sh | 101 --------------- .../s5/local/run_norm.sh | 36 ++++++ .../s5/local/train_pocolm.sh | 54 ++++++++ egs/fisher_callhome_spanish/s5/path.sh | 11 +- egs/fisher_callhome_spanish/s5/run.sh | 121 ++++++++++++------ egs/fisher_callhome_spanish/s5/steps | 2 +- egs/fisher_callhome_spanish/s5/utils | 2 +- 24 files changed, 699 insertions(+), 199 deletions(-) delete mode 100644 egs/fisher_callhome_spanish/s5/RESULTS create mode 100644 egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py create mode 100755 egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh create mode 100644 egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py create mode 100755 egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/get_data_weights.pl create mode 100755 egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py create mode 100644 egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py create mode 100755 egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/rnnlm.sh delete mode 100755 egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/run_norm.sh create mode 100755 egs/fisher_callhome_spanish/s5/local/train_pocolm.sh diff --git a/egs/fisher_callhome_spanish/s5/RESULTS b/egs/fisher_callhome_spanish/s5/RESULTS deleted file mode 100644 index 66613163cea..00000000000 --- a/egs/fisher_callhome_spanish/s5/RESULTS +++ /dev/null @@ -1,38 +0,0 @@ 
--------------------------------------------------------------------------------------- -Triphone with mono alignment (small) --------------------------------------------------------------------------------------- -%WER 53.70 [ 21570 / 40170, 2618 ins, 6013 del, 12939 sub ] exp/tri1/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -Triphone with tri alignments --------------------------------------------------------------------------------------- -%WER 53.18 [ 21364 / 40170, 2889 ins, 5533 del, 12942 sub ] exp/tri2/decode_dev/wer_13_0.0 - --------------------------------------------------------------------------------------- -Triphone + LDA + MLLT --------------------------------------------------------------------------------------- -%WER 46.95 [ 18858 / 40170, 2636 ins, 5197 del, 11025 sub ] exp/tri3a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ SAT + fMLLR --------------------------------------------------------------------------------------- -%WER 42.86 [ 17217 / 40170, 2556 ins, 4633 del, 10028 sub ] exp/tri4a/decode_dev/wer_15_0.0 - --------------------------------------------------------------------------------------- -+ More leaves and gaussians --------------------------------------------------------------------------------------- -%WER 40.48 [ 16261 / 40170, 2689 ins, 4130 del, 9442 sub ] exp/tri5a/decode_dev/wer_14_0.0 - --------------------------------------------------------------------------------------- -+ bMMI + SGMM --------------------------------------------------------------------------------------- -%WER 38.43 [ 15437 / 40170, 2800 ins, 3685 del, 8952 sub ] exp/sgmm5/decode_dev/wer_10_0.0 -%WER 36.90 [ 14821 / 40170, 2708 ins, 3552 del, 8561 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it1/wer_10_0.0 -%WER 36.09 [ 14499 / 40170, 2511 ins, 3737 del, 8251 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it2/wer_11_0.0 -%WER 35.48 [ 14252 / 40170, 2672 ins, 3370 del, 8210 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it3/wer_10_0.0 -%WER 35.16 [ 14122 / 40170, 2701 ins, 3287 del, 8134 sub ] exp/sgmm5_mmi_b0.1/decode_dev_it4/wer_10_0.0 - --------------------------------------------------------------------------------------- -pNorm-Ensemble DNN --------------------------------------------------------------------------------------- -%WER 35.13 [ 14113 / 40170, 2680 ins, 3405 del, 8028 sub ] exp/tri6a_dnn/decode_dev/wer_11_0.0 diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index 88db78823a5..db97f1fbc6f 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 4G" -export decode_cmd="queue.pl --mem 4G" +export train_cmd="retry.pl queue.pl --mem 8G" +export decode_cmd="retry.pl queue.pl --mem 8G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 7f407552c2e..2f478419a18 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -27,9 +27,10 @@ nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=1g #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. common_egs_dir= reporting_email= +gigaword_workdir= # LSTM/chain options -train_stage=-10 +train_stage=-20 xent_regularize=0.1 dropout_schedule='0,0@0.20,0.3@0.50,0' @@ -156,7 +157,7 @@ if [ $stage -le 19 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" @@ -201,7 +202,7 @@ fi if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi @@ -254,9 +255,10 @@ if [ $stage -le 21 ]; then fi +# Let's train first a small RNNLM on Fisher train set rnnlmdir=exp/rnnlm_lstm_tdnn_1b if [ $stage -le 22 ]; then - local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; + rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; fi if [ $stage -le 23 ]; then @@ -277,7 +279,11 @@ if [ $stage -le 23 ]; then --online-ivector-dir exp/nnet3/ivectors_${data}_hires \ $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done - bash local/rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ + if [ $gigaword_workdir ]; then + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ + ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; + fi + bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & done diff --git a/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py new file mode 100644 index 00000000000..7d92eb9fe3a --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/clean_abbrevs_text.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc., + +import os, sys +import re +import codecs + +if len(sys.argv) < 3: + print("Usage : python clean_abbrevs_text.py ") + print(" Processes the text before text normalisation to convert uppercase words as space separated letters") + sys.exit() + +inputfile=codecs.open(sys.argv[1], encoding='utf-8') +outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') + +for line in inputfile: + words = line.split() + textout = "" + wordcnt = 0 + for word in words: + if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): + if wordcnt > 0: + word = re.sub('\'?s', 's', word) + textout = textout + " ".join(word) + " " + else: + textout = textout + word + " " + else: + textout = textout + word + " " + if word.isalpha(): wordcnt = wordcnt + 1 + outputfile.write(textout.strip()+ '\n') + +inputfile.close() +outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh new file mode 100755 index 00000000000..1880b3a90cb --- /dev/null +++ 
b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Script to clean up gigaword LM text +# Removes punctuations, does case normalization + +stage=0 +nj=500 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1; +fi + +if [ ! -s `which normalizer_main` ] ; then + echo "Sparrowhawk normalizer was not found installed !" + echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" + exit 1 +fi + +txtdir=$1 +textdir=$(realpath $txtdir) +outdir=$(realpath $2) + +workdir=$outdir/tmp +if [ $stage -le 0 ]; then + rm -rf $outdir + mkdir -p $workdir + mkdir -p $textdir/splits + mkdir -p $outdir/data + split -l 1000000 $textdir/in.txt $textdir/splits/out + numsplits=0 + for x in $textdir/splits/*; do + numsplits=$((numsplits+1)) + ln -s $x $outdir/data/$numsplits + done + echo $numsplits + cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . + $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ + local/run_norm.sh \ + sparrowhawk_configuration.ascii_proto \ + $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $outdir/data \ + JOB \ + $outdir/sparrowhawk/ + cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized + + # check if numbers are there in normalized output + awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ + $outdir/text_normalized > $outdir/unique_words + grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers +fi diff --git a/egs/fisher_callhome_spanish/s5/local/ctm.sh b/egs/fisher_callhome_spanish/s5/local/ctm.sh index 62860a10b7b..7d09f574580 100755 --- a/egs/fisher_callhome_spanish/s5/local/ctm.sh +++ b/egs/fisher_callhome_spanish/s5/local/ctm.sh @@ -19,9 +19,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} +#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . /export/babel/data/software/env.sh diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh new file mode 100755 index 00000000000..242359e7c28 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_all_gigaword.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +# Path to Gigaword corpus with all data files decompressed. +export GIGAWORDDIR=$1 +# The directory to write output to +export OUTPUTDIR=$2 +# The number of jobs to run at once +export NUMJOBS=$3 + +echo "Flattening Gigaword with ${NUMJOBS} processes..." +mkdir -p $OUTPUTDIR +find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; +echo "Combining the flattened files into one..." 
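# Annotation (usage sketch, not part of the committed file): run.sh drives this
# wrapper with the Gigaword source directory, an output directory and a job count,
# e.g.
#   local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" \
#     "$rnnlm_workdir"/flattened_gigaword_corpus 24
# The per-file flattening itself is done by local/flatten_gigaword/run_flat.sh via
# the find/-exec call above.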
+cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py new file mode 100644 index 00000000000..29f6766dd84 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/flatten_one_gigaword.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import logging +import os +import re +import spacy +import gzip + +from argparse import ArgumentParser +from bs4 import BeautifulSoup + +en_nlp = spacy.load("es") + + +def flatten_one_gigaword_file(file_path): + f = gzip.open(file_path) + html = f.read() + # Parse the text with BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + + # Iterate over all
<p>
items and get the text for each. + all_paragraphs = [] + for paragraph in soup("p"): + # Turn inter-paragraph newlines into spaces + paragraph = paragraph.get_text() + paragraph = re.sub(r"\n+", "\n", paragraph) + paragraph = paragraph.replace("\n", " ") + # Tokenize the paragraph into words + tokens = en_nlp.tokenizer(paragraph) + words = [str(token) for token in tokens if not + str(token).isspace()] + if len(words) < 3: + continue + all_paragraphs.append(words) + # Return a list of strings, where each string is a + # space-tokenized paragraph. + return [" ".join(paragraph) for paragraph in all_paragraphs] + + +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_fmt) + logger = logging.getLogger(__name__) + + parser = ArgumentParser(description=("Flatten a gigaword data file for " + "use in language modeling.")) + parser.add_argument("--gigaword-path", required=True, + metavar="", type=str, + help=("Path to Gigaword directory, with " + "all .gz files unzipped.")) + parser.add_argument("--output-dir", required=True, metavar="", + type=str, help=("Directory to write final flattened " + "Gigaword file.")) + + A = parser.parse_args() + all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) + output_path = os.path.join(A.output_dir, + os.path.basename(A.gigaword_path) + ".flat") + with open(output_path, "w") as output_file: + for paragraph in all_paragraphs: + output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh new file mode 100755 index 00000000000..6b236be0ab9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/flatten_gigaword/run_flat.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -e + +. ./path_venv.sh + +# Path to Gigaword corpus with all data files decompressed. +GIGAWORDPATH=$1 +# The directory to write output to +OUTPUTDIR=$2 +file=$(basename ${GIGAWORDPATH}) +if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then + echo "flattening to ${OUTPUTDIR}/${file}.flat" + python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} +else + echo "skipping ${file}.flat" +fi + diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index 11d65da3e95..22b98a6c9db 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -133,6 +133,7 @@ if [ $stage -le 2 ]; then sed 's:::g' | \ sed 's:foreign>::g' | \ + sed 's:\[noise\]:[noise] :g' | \ sed 's:>::g' | \ #How do you handle numbers? 
grep -v '()' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 779298305c4..7b2de2db392 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -105,8 +105,9 @@ if [ $stage -le 4 ]; then cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" # Add prons for laughter, noise, oov - w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') - perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2 + for w in `grep -v sil $dir/silence_phones.txt`; do + sed -i "/\[$w\]/d" $tmpdir/lexicon.2 + done for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl new file mode 100755 index 00000000000..ca5b2a46f8e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_data_weights.pl @@ -0,0 +1,39 @@ +#!/usr/bin/env perl + +# Nagendra Kumar Goel + +# This takes two arguments: +# 1) Pocolm training output folder +# 2) rnnlm weights file name (for output) + +use POSIX; +use List::Util qw[min max]; + +if (@ARGV != 2) { + die "Usage: get_data_weights.pl \n"; +} + +$pdir = shift @ARGV; +$out = shift @ARGV; + +open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; +open(N, "<$pdir/names") || die "Could not open $pdir/names" ; +open(O, ">$out") || die "Could not open $out for writing" ; + +my %scores = (); + +while() { + @n = split(/\s/,$_); + $name = $n[1]; + $w =
<P>
; + @w = split(/\s/,$w); + $weight = $w[1]; + $scores{$name} = $weight; +} + +$min = min(values %scores); + +for(keys %scores) { + $weightout = POSIX::ceil($scores{$_} / $min); + print O "$_\t1\t$weightout\n"; +} diff --git a/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py new file mode 100755 index 00000000000..fc13a7af701 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_rnnlm_wordlist.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 5: + print( "Usage: python get_rnnlm_wordlist.py ") + sys.exit() + +lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") +pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") +rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") +oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") + +line_count=0 +lexicon=[] + +for line in lexicon_words: + lexicon.append(line.split()[0]) + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +for line in pocolm_words: + if not line.split()[0] in lexicon: + oov_wordlist.write(line.split()[0]+'\n') + rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') + line_count = line_count + 1 + +lexicon_words.close() +pocolm_words.close() +rnnlm_wordsout.close() +oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py new file mode 100644 index 00000000000..3ecd16772d7 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/get_unigram_weights_vocab.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# 2018 Saikiran Valluri, GoVivace inc. + +import os, sys + +if len(sys.argv) < 3: + print("Usage : python . ") + print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") + sys.exit() + +pocolmdir=sys.argv[1] +unigramwts=open(sys.argv[2], 'w') + +names = open(pocolmdir+"/names", 'r') +metaparams = open(pocolmdir+"/metaparameters", 'r') + +name_mapper={} +for line in names: + fields=line.split() + name_mapper[fields[0]] = fields[1] + +lns = metaparams.readlines() +for lineno in range(len(name_mapper.keys())): + line = lns[lineno] + fileid = line.split()[0].split("_")[-1] + weight = line.split()[1] + unigramwts.write(name_mapper[fileid] + " " + weight + "\n") + +names.close() +unigramwts.close() +metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index b42eb52d20a..94546dc44c3 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,11 +1,12 @@ -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 -# 2018 Saikiran Valluri, GoVivace inc., Avaaya #!/usr/bin/env python # -*- coding: utf-8 -*- # +# 2018 Saikiran Valluri, GoVivace inc., Avaaya + # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon from __future__ import print_function -import sys, re +import sys +import re import json import codecs import operator diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh new file mode 100755 index 00000000000..0e71be29119 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash + +# this script generates Pocolm-estimated language models with various +# data sources in data/text folder and places the output in data/lm. + +set -euo pipefail + +. ./path.sh + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + + +wordlist=None +num_word=100000 +pocolm_stage=1 +ngram_order=3 +lm_dir= +arpa_dir= +textdir= +max_memory='--max-memory=8G' + +. ./cmd.sh +. ./utils/parse_options.sh + + +# If you do not want to set memory limitation for "sort", you can use +#max_memory= +# Choices for the max-memory can be: +# 1) integer + 'K', 'M', 'G', ... +# 2) integer + 'b', meaning unit is byte and no multiplication +# 3) integer + '%', meaning a percentage of memory +# 4) integer, default unit is 'K' + +fold_dev_opt= +# If you want to fold the dev-set in to the 'swbd1' set to produce the final +# model, un-comment the following line. For use in the Kaldi example script for +# ASR, this isn't suitable because the 'dev' set is the first 10k lines of the +# switchboard data, which we also use as dev data for speech recognition +# purposes. +#fold_dev_opt="--fold-dev-into=swbd1" + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 3-gram model running with train_lm.py. +# the dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018,0.902,0.371,0.183,0.070" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +limit_unk_history_opt= +# If you want to limit the left of in the history of a n-gram +# un-comment the following line +#limit_unk_history_opt="--limit-unk-history=true" + +for order in ${ngram_order}; do + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
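# Annotation (expansion sketch, not part of the committed file): with the arguments
# local/train_pocolm.sh passes in this recipe, lm_name below expands to "0_2" for the
# first, closed-vocabulary pass and "110000_3" for the second pass, so the unpruned
# models land in ${lm_dir}/0_2.pocolm and ${lm_dir}/110000_3.pocolm (the latter is the
# directory run.sh later hands to local/get_rnnlm_wordlist.py). Example invocation:
#   bash local/pocolm_cust.sh --num-word 110000 --ngram-order 3 \
#     --lm-dir "$pocolm_dir"/lm --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir"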
+ lm_name="${num_word}_${order}" + min_counts='' + # Note: the following might be a more reasonable setting: + # min_counts='fisher=2 swbd1=1' + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ + --min-counts=${min_counts} \ + --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ + ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} + + if [ $pocolm_stage -eq 2 ];then + mkdir -p ${arpa_dir} + format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz + + # example of pruning. note: the threshold can be less than or more than one. + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + for threshold in 1.0 2.0 4.0; do + pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm + prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 + get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz + + done + + # example of pruning by size. + size=1000000 + pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm + prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' + get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz + fi +done + +# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) + +# the following does does some self-testing, including +# that the computed derivatives are accurate. +# local/self_test.sh + +# perplexities from pocolm-estimated language models with pocolm's interpolation +# method from orders 3, 4, and 5 are: +# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) +# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) +# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) + +# note, the perplexities from pocolm-estimated language models with SRILM's +# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), +# 78.8449 and 75.2202 respectively. + +# note, the perplexities from SRILM-estimated language models with SRILM's +# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), +# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh new file mode 100755 index 00000000000..3850910f312 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/rnnlm.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Xiaohui Zhang + +# This script trains LMs on the swbd LM-training data. + +# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. 
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. +# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 +# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 + + +dir=Spanish_gigawrd/rnnlm +pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned +wordslist= +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=0 +train_stage=-30 +text_dir=Spanish_gigawrd/text_lm + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $dir/config +set -e + +for f in $text_dir/dev.txt; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist;" && exit 1 +done + +if [ $stage -le 0 ]; then + if [ -f $text_dir/unigram_weights ] ; then + mv $text_dir/unigram_weights $pocolm_dir/ + fi + cp $wordslist $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt + rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ + --unk-word="" \ + --data-weights-file=$dir/config/data_weights.txt \ + $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt +fi + +if [ $stage -le 1 ]; then + cat <$dir/config/xconfig + input dim=$embedding_dim name=input + relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) + fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) + fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd + relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) + output-layer name=output include-log-softmax=false dim=$embedding_dim +EOF + rnnlm/validate_config_dir.sh $text_dir $dir/config +fi + +if [ $stage -le 2 ]; then + rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir +fi + +if [ $stage -le 3 ]; then + rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ + --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir +fi + +exit 0 diff --git a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh b/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh deleted file mode 100755 index 3713fe228d6..00000000000 --- a/egs/fisher_callhome_spanish/s5/local/rnnlm/train_rnnlm.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson -# 2017 Hainan Xu -# 2017 Ke Li - -# This script is similar to rnnlm_lstm_tdnn_a.sh except for adding L2 regularization. - -# local/rnnlm/train_rnnlm.sh: best iteration (out of 18) was 17, linking it to final iteration. -# local/rnnlm/train_rnnlm.sh: train/dev perplexity was 45.6 / 68.7. 
-# Train objf: -651.50 -4.44 -4.26 -4.15 -4.08 -4.03 -4.00 -3.97 -3.94 -3.92 -3.90 -3.89 -3.88 -3.86 -3.85 -3.84 -3.83 -3.82 -# Dev objf: -10.76 -4.68 -4.47 -4.38 -4.33 -4.29 -4.28 -4.27 -4.26 -4.26 -4.25 -4.24 -4.24 -4.24 -4.23 -4.23 -4.23 -4.23 - -# Begin configuration section. -dir=exp/rnnlm_lstm_tdnn_1b -embedding_dim=200 -embedding_l2=0.005 # embedding layer l2 regularize -comp_l2=0.005 # component-level l2 regularize -output_l2=0.005 # output-layer l2 regularize -epochs=90 -mic= -stage=-10 -train_stage=0 - -. ./cmd.sh -. ./utils/parse_options.sh -[ -z "$cmd" ] && cmd=$train_cmd - -train=data/train/text -dev=data/dev2/text # We at no stage in run.sh should decode dev2 partition for results! -wordlist=data/lang/words.txt -text_dir=data/local/rnnlm/text -mkdir -p $dir/config -set -e - -for f in $train $dev $wordlist; do - [ ! -f $f ] && \ - echo "$0: expected file $f to exist; search for run.sh and utils/prepare_lang.sh in run.sh" && exit 1 -done - -if [ $stage -le 0 ]; then - mkdir -p $text_dir - cat $train | cut -d ' ' -f2- > $text_dir/ami.txt - cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt -fi - -if [ $stage -le 1 ]; then - cp $wordlist $dir/config/ - n=`cat $dir/config/words.txt | wc -l` - echo " $n" >> $dir/config/words.txt - - # words that are not present in words.txt but are in the training or dev data, will be - # mapped to during training. - echo "" >$dir/config/oov.txt - - cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt - - # choose features - rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ - --use-constant-feature=true \ - --top-word-features 10000 \ - --min-frequency 1.0e-03 \ - --special-words=',,,,[noise],[laughter]' \ - $dir/config/words.txt > $dir/config/features.txt - -lstm_opts="l2-regularize=$comp_l2" -tdnn_opts="l2-regularize=$comp_l2" -output_opts="l2-regularize=$output_l2" - - cat >$dir/config/xconfig < $dir/normalize/$job/substitute.sh + +bash $dir/normalize/$job/substitute.sh | \ + sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ + sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text +normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh new file mode 100755 index 00000000000..b8b3ca35ef9 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/train_pocolm.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +stage=-2 +num_words_pocolm=110000 +prune_size=1000000 + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +set -euo pipefail + +export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) +export PATH=$PATH:$POCOLM_ROOT/scripts + +textdir=$1 +pocolm_dir=$2 + + +if [ $stage -le -2 ]; then + echo "****" + echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" + echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." + echo "****" + if [ -e "$textdir"/unigram_weights ]; then + rm "$textdir"/unigram_weights + fi + if [ -e "$pocolm_dir" ]; then + rm -r "$pocolm_dir" + fi + + bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + +fi + +if [ $stage -le -1 ];then + echo "********" + echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." 
+ echo "********" + + echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done + python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights + bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ + --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" + prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ + "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" + mkdir -p "$pocolm_dir"/arpa + format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ + gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz +fi + + +exit 0; diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 17ffb0369f8..2993311fd90 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -1,6 +1,13 @@ -export KALDI_ROOT=`pwd`/../../.. +export KALDI_ROOT=`pwd`/../../../ +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh +export LD_LIBRARY_PATH=/home/dpovey/libs + +export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk +export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs +export LANG=C + +source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 6e2752a7b68..95425c29034 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -4,14 +4,22 @@ # Copyright 2014 Gaurav Kumar. Apache 2.0 # Recipe for Fisher/Callhome-Spanish -stage=0 -train_stage=-20 +stage=-1 +lmstage=-2 +train_rnnlm=false +start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. + # If you already have the normalised gigword text somewhere, you can bypass the + # time consuming text cleanup (~1 week) by setting this option false. +addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to + # perform the A, A + G, Dev type POCOLM training configuration. + # A=fsp train, G=gigword text, +num_words_pocolm=110000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +sfisher_transcripts=/export/c03/svalluri//LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher @@ -19,15 +27,17 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data +rnnlm_workdir=workdir_rnnlm_Spanish_08032019 mfccdir=`pwd`/mfcc . ./cmd.sh if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; -set -e +set -eou pipefail -if [ $stage -le 1 ]; then +if [ $stage -le -1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -37,19 +47,14 @@ if [ $stage -le 1 ]; then # ES gigaword corpus to bring the total to 64k words. 
The ES frequency sorted # wordlist is downloaded if it is not available. local/fsp_prepare_dict.sh $spanish_lexicon + # Let's keep the original dict copy for G2P training + cp -r data/local/dict data/local/dict_orig + ( + steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error + ) & # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh + utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig utils/fix_data_dir.sh data/local/data/train_all @@ -70,34 +75,65 @@ if [ $stage -le 1 ]; then cp -r data/local/data/callhome_train_all data/callhome_train_all - # Creating data partitions for the pipeline - # We need datasets for both the ASR and SMT system - # We have 257455 utterances left, so the partitions are roughly as follows - # ASR Train : 100k utterances - # ASR Tune : 17455 utterances - # ASR Eval : 20k utterances - # MT Train : 100k utterances - # MT Tune : Same as the ASR eval set (Use the lattices from here) - # MT Eval : 20k utterances - # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker - # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. - # As noted above, the LM has not been trained on the dev and the test sets. - #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test - #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test - #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test - #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev - #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test - #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train - #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test - #rm -r data/dev_and_test - #rm -r data/asr_dev_and_test - #rm -r data/mt_train_and_test - local/create_splits.sh $split local/callhome_create_splits.sh $split_callhome + fi +if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can opt out this stage " + echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." 
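# Annotation (data-flow sketch, not part of the committed hunk), naming only files
# that the commands below create:
#   "$rnnlm_workdir"/gigaword_rawtext/in.txt                     <- concatenated *.flat Gigaword text
#   "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized  <- Sparrowhawk-normalized output
#   "$rnnlm_workdir"/text_lm/{train.txt,dev.txt,spanish_gigaword_normalised.txt}
#                                                                <- train/dev text consumed by the pocolm and RNNLM stages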
+ + if [ $stage -le 0 ]; then + mkdir -p "$rnnlm_workdir"/gigaword_rawtext + local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 + cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt + local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ + "$rnnlm_workdir"/normalised_gigaword_corpus/ + mkdir -p "$rnnlm_workdir"/text_lm + cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt + cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. + cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + if $addtraintext; then + cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt + fi + fi +fi + +if [ $stage -le 1 ]; then + local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm + local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ + "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords + if $train_rnnlm; then + local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ + --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm + fi +fi + + if [ $stage -le 2 ]; then + wait # wait till G2P training finishes + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt + cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + # Make sure that you do not use your test and your dev sets to train the LM + # Some form of cross validation is possible where you decode your dev/set based on an + # LM that is trained on everything but that that conversation + # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl + # to get the numbers. Depending on your needs, you might have to change the size of + # the splits within that file. 
The default paritions are based on the Kaldi + Joshua + # requirements which means that I have very large dev and test sets + local/fsp_train_lms.sh $split + local/fsp_create_test_lang.sh + # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -264,8 +300,11 @@ for iter in 1 2 3 4; do data/lang_test data/dev/ exp/sgmm5/decode_dev $decode done ) & - fi -local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; +wait; + +if [ $stage -le 6 ]; then + local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; +fi exit 0; diff --git a/egs/fisher_callhome_spanish/s5/steps b/egs/fisher_callhome_spanish/s5/steps index 6e99bf5b5ad..1b186770dd1 120000 --- a/egs/fisher_callhome_spanish/s5/steps +++ b/egs/fisher_callhome_spanish/s5/steps @@ -1 +1 @@ -../../wsj/s5/steps \ No newline at end of file +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/utils b/egs/fisher_callhome_spanish/s5/utils index b240885218f..a3279dc8679 120000 --- a/egs/fisher_callhome_spanish/s5/utils +++ b/egs/fisher_callhome_spanish/s5/utils @@ -1 +1 @@ -../../wsj/s5/utils \ No newline at end of file +../../wsj/s5/utils/ \ No newline at end of file From f61047074ffc0cf35afbe3535c29d5e19a4c3c9a Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 2 Apr 2019 05:44:59 -0400 Subject: [PATCH 33/49] removed s5_gigaword folder --- .../s5_gigaword/cmd.sh | 15 - .../s5_gigaword/conf/decode.config | 6 - .../s5_gigaword/conf/mfcc.conf | 2 - .../s5_gigaword/conf/mfcc_hires.conf | 10 - .../s5_gigaword/conf/online_cmvn.conf | 1 - .../s5_gigaword/conf/plp.conf | 2 - .../local/callhome_create_splits.sh | 31 - .../s5_gigaword/local/callhome_data_prep.sh | 163 ---- .../s5_gigaword/local/callhome_get_1_best.py | 75 -- .../local/callhome_get_lattices.py | 115 --- .../local/callhome_make_spk2gender.sh | 29 - .../s5_gigaword/local/callhome_make_trans.pl | 74 -- .../s5_gigaword/local/callhome_text_pp.sh | 9 - .../s5_gigaword/local/chain/run_tdnn_1g.sh | 294 ------- .../s5_gigaword/local/clean_abbrevs_text.py | 35 - .../s5_gigaword/local/clean_txt_dir.sh | 57 -- .../s5_gigaword/local/create_oracle_ctm.sh | 30 - .../s5_gigaword/local/create_splits.sh | 30 - .../s5_gigaword/local/ctm.sh | 34 - .../s5_gigaword/local/decode_report.py | 148 ---- .../s5_gigaword/local/find_unique_phones.pl | 25 - .../s5_gigaword/local/fix_stm.sh | 10 - .../flatten_gigaword/flatten_all_gigaword.sh | 15 - .../flatten_gigaword/flatten_one_gigaword.py | 61 -- .../local/flatten_gigaword/run_flat.sh | 17 - .../s5_gigaword/local/fsp_create_test_lang.sh | 49 -- .../s5_gigaword/local/fsp_data_prep.sh | 176 ---- .../local/fsp_ideal_data_partitions.pl | 85 -- .../s5_gigaword/local/fsp_make_spk2gender.sh | 29 - .../s5_gigaword/local/fsp_make_trans.pl | 81 -- .../s5_gigaword/local/fsp_prepare_dict.sh | 142 ---- .../s5_gigaword/local/fsp_train_lms.sh | 140 ---- .../s5_gigaword/local/get_1_best.py | 62 -- .../s5_gigaword/local/get_data_weights.pl | 39 - .../s5_gigaword/local/get_lattices.py | 115 --- .../s5_gigaword/local/get_oracle.sh | 32 - .../s5_gigaword/local/get_rnnlm_wordlist.py | 34 - .../local/get_unigram_weights_vocab.py | 33 - .../s5_gigaword/local/isolate_phones.pl | 66 -- .../s5_gigaword/local/latconvert.sh | 124 --- .../s5_gigaword/local/merge_lexicons.py | 65 -- .../s5_gigaword/local/monitor_denlats.sh | 31 - .../local/nnet3/run_ivector_common.sh | 
187 ----- .../s5_gigaword/local/pocolm_cust.sh | 120 --- .../s5_gigaword/local/process_oracle.py | 64 -- .../s5_gigaword/local/rescore.sh | 24 - .../s5_gigaword/local/rnnlm.sh | 83 -- .../s5_gigaword/local/run_norm.sh | 36 - .../s5_gigaword/local/run_sgmm2x.sh | 57 -- .../s5_gigaword/local/score.sh | 1 - .../s5_gigaword/local/score_oracle.sh | 29 - .../s5_gigaword/local/splits/dev | 20 - .../local/splits/split_callhome/dev | 20 - .../local/splits/split_callhome/test | 20 - .../local/splits/split_callhome/train | 80 -- .../s5_gigaword/local/splits/split_fisher/dev | 20 - .../local/splits/split_fisher/dev2 | 20 - .../local/splits/split_fisher/test | 20 - .../local/splits/split_fisher/train | 759 ------------------ .../s5_gigaword/local/splits/test | 20 - .../s5_gigaword/local/splits/train | 80 -- .../s5_gigaword/local/spron.pl | 304 ------- .../s5_gigaword/local/subset_data_prep.sh | 164 ---- .../s5_gigaword/local/train_get_1_best.py | 79 -- .../s5_gigaword/local/train_get_lattices.py | 125 --- .../s5_gigaword/local/train_pocolm.sh | 54 -- .../s5_gigaword/local/train_process_oracle.py | 79 -- .../s5_gigaword/local/wer_output_filter | 5 - .../s5_gigaword/path.sh | 13 - egs/fisher_callhome_spanish/s5_gigaword/rnnlm | 1 - .../s5_gigaword/run.sh | 310 ------- egs/fisher_callhome_spanish/s5_gigaword/steps | 1 - egs/fisher_callhome_spanish/s5_gigaword/utils | 1 - 73 files changed, 5387 deletions(-) delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/cmd.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/clean_abbrevs_text.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh delete mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_prepare_dict.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/pocolm_cust.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/local/score.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/test delete mode 100644 egs/fisher_callhome_spanish/s5_gigaword/local/splits/train delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py delete mode 100755 
egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/path.sh delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/rnnlm delete mode 100755 egs/fisher_callhome_spanish/s5_gigaword/run.sh delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/steps delete mode 120000 egs/fisher_callhome_spanish/s5_gigaword/utils diff --git a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh b/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh deleted file mode 100755 index db97f1fbc6f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/cmd.sh +++ /dev/null @@ -1,15 +0,0 @@ -# you can change cmd.sh depending on what type of queue you are using. -# If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run -# commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different -# queue names and different ways of specifying things like memory; -# to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. Search for -# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, -# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. - -export train_cmd="retry.pl queue.pl --mem 8G" -export decode_cmd="retry.pl queue.pl --mem 8G" -export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config b/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config deleted file mode 100644 index 7908f178373..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/decode.config +++ /dev/null @@ -1,6 +0,0 @@ -# Use wider-than-normal decoding beams. -first_beam=16.0 -beam=20.0 -lat_beam=10.0 -min_lmwt=2 -max_lmwt=10 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf deleted file mode 100644 index ffb41a1aae4..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc.conf +++ /dev/null @@ -1,2 +0,0 @@ ---use-energy=false # only non-default option. ---sample-frequency=8000 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf deleted file mode 100644 index d870ab04c38..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/mfcc_hires.conf +++ /dev/null @@ -1,10 +0,0 @@ -# config for high-resolution MFCC features, intended for neural network training. -# Note: we keep all cepstra, so it has the same info as filterbank features, -# but MFCC is more easily compressible (because less correlated) which is why -# we prefer this method. ---use-energy=false # use average of log energy, not energy. ---sample-frequency=8000 # Switchboard is sampled at 8kHz ---num-mel-bins=40 # similar to Google's setup. ---num-ceps=40 # there is no dimensionality reduction. 
---low-freq=40 # low cutoff frequency for mel bins ---high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf deleted file mode 100644 index 7748a4a4dd3..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/online_cmvn.conf +++ /dev/null @@ -1 +0,0 @@ -# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf b/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf deleted file mode 100644 index c4b73674cab..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/conf/plp.conf +++ /dev/null @@ -1,2 +0,0 @@ -# No non-default options for now. - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh deleted file mode 100755 index 07814da46a9..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_create_splits.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -data_dir=data -train_all=data/callhome_train_all - -if [ $# -lt 1 ]; then - echo "Specify the location of the split files" - exit 1; -fi - -splitFile=$1 - -# Train first -for split in train dev test -do - dirName=callhome_$split - - cp -r $train_all $data_dir/$dirName - - awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ - $splitFile/$split $train_all/segments > $data_dir/$dirName/segments - - n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$dirName/segments | sort | uniq | wc -l` - - echo "$n conversations left in split $dirName" - - utils/fix_data_dir.sh $data_dir/$dirName - utils/validate_data_dir.sh $data_dir/$dirName -done - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh deleted file mode 100755 index f61b0fa9519..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_data_prep.sh +++ /dev/null @@ -1,163 +0,0 @@ -#!/bin/bash -# -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Callhome Spanish Dataset. (*.sph files) -# In addition the transcripts are needed as well. -# To be run from one directory above this script. - -# Note: when creating your own data preparation scripts, it's a good idea -# to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the -# transcription file is exactly the same length as the scp file and is also -# sorted on utterance id (missing transcriptions should be removed from the -# scp file using e.g. scripts/filter_scp.pl) - -stage=0 - -export LC_ALL=C - - -if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Callhome Spanish Speech and Transcript Directories, se -e ../run.sh for example." - exit 1; -fi - -cdir=`pwd` -dir=`pwd`/data/local/data -local=`pwd`/local -utils=`pwd`/utils -tmpdir=`pwd`/data/local/tmp - -. ./path.sh || exit 1; # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. 
This relies on the command -# line arguments being absolute pathnames. -#rm -r links/ 2>/dev/null -mkdir -p links/ -ln -s $* links - -# Basic spot checks to see if we got the data that we needed -if [ ! -d links/LDC96S35 -o ! -d links/LDC96T17 ]; -then - echo "The speech and the data directories need to be named LDC96S35 and LDC96T17 respecti -vely" - exit 1; -fi - -if [ ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST -o ! -d links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN ]; -then - echo "Dev, Eval or Train directories missing or not properly organised within the speech data dir" - exit 1; -fi - -#Check the transcripts directories as well to see if they exist -if [ ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest -o ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest -o ! -d links/LDC96T17/callhome_spanish_trans_970711/transcrp/train ] -then - echo "Transcript directories missing or not properly organised" - exit 1; -fi - -speech_train=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/TRAIN -speech_dev=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/DEVTEST -speech_test=$dir/links/LDC96S35/CALLHOME/SPANISH/SPEECH/EVLTEST -transcripts_train=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/train -transcripts_dev=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/devtest -transcripts_test=$dir/links/LDC96T17/callhome_spanish_trans_970711/transcrp/evltest - -fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` -fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` -fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` -fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` -fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` -fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` - -#Now check if we got all the files that we needed -if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; -then - echo "Incorrect number of files in the data directories" - echo "The paritions should contain 80/20/20 files" - exit 1; -fi - -if [ $stage -le 0 ]; then - #Gather all the speech files together to create a file list - ( - find $speech_train -iname '*.sph'; - find $speech_dev -iname '*.sph'; - find $speech_test -iname '*.sph'; - ) > $tmpdir/callhome_train_sph.flist - - #Get all the transcripts in one place - - ( - find $transcripts_train -iname '*.txt'; - find $transcripts_dev -iname '*.txt'; - find $transcripts_test -iname '*.txt'; - ) > $tmpdir/callhome_train_transcripts.flist - -fi - -if [ $stage -le 1 ]; then - $local/callhome_make_trans.pl $tmpdir - mkdir -p $dir/callhome_train_all - mv $tmpdir/callhome_reco2file_and_channel $dir/callhome_train_all/ -fi - -if [ $stage -le 2 ]; then - sort $tmpdir/callhome.text.1 | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $dir/callhome_train_all/callhome.text - - #Create segments file and utt2spk file - ! 
cat $dir/callhome_train_all/callhome.text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/callhome_train_all/callhome_utt2spk \ - && echo "Error producing utt2spk file" && exit 1; - - cat $dir/callhome_train_all/callhome.text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; - $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/callhome_train_all/callhome_segments - - $utils/utt2spk_to_spk2utt.pl <$dir/callhome_train_all/callhome_utt2spk > $dir/callhome_train_all/callhome_spk2utt -fi - -if [ $stage -le 3 ]; then - for f in `cat $tmpdir/callhome_train_sph.flist`; do - # convert to absolute path - make_absolute.sh $f - done > $tmpdir/callhome_train_sph_abs.flist - - cat $tmpdir/callhome_train_sph_abs.flist | perl -ane 'm:/([^/]+)\.SPH$: || die "bad line $_; "; print lc($1)," $_"; ' > $tmpdir/callhome_sph.scp - cat $tmpdir/callhome_sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ - sort -k1,1 -u > $dir/callhome_train_all/callhome_wav.scp || exit 1; -fi - -if [ $stage -le 4 ]; then - # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. - cd $cdir - #TODO: needs to be rewritten - $local/callhome_make_spk2gender.sh > $dir/callhome_train_all/callhome_spk2gender -fi - -# Rename files from the callhome directory -if [ $stage -le 5 ]; then - cd $dir/callhome_train_all - mv callhome.text text - mv callhome_segments segments - mv callhome_spk2utt spk2utt - mv callhome_wav.scp wav.scp - mv callhome_reco2file_and_channel reco2file_and_channel - mv callhome_spk2gender spk2gender - mv callhome_utt2spk utt2spk - cd $cdir -fi - -fix_data_dir.sh $dir/callhome_train_all || exit 1 -utils/validate_data_dir.sh --no-feats $dir/callhome_train_all || exit 1 - -echo "CALLHOME spanish Data preparation succeeded." - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py deleted file mode 100755 index a81818c2858..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_1_best.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -import os -import sys - -def findTranscription(timeDetail): - file1 = open('exp/tri5a/decode_callhome_dev/scoring/13.tra') - file2 = open('exp/tri5a/decode_callhome_train/scoring/13.tra') - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - for line in file2: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - - -wordsFile = open('exp/tri5a/graph/words.txt') -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? -# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists('exp/tri5a/one-best/ch_train'): - os.makedirs('exp/tri5a/one-best/ch_train') - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/asr.train', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') - newFile = open('exp/tri5a/one-best/ch_train/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - - newFile.close() -provFile.close() - - - - - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py deleted file mode 100755 index 4c96e01ce7e..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_get_lattices.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -from __future__ import print_function -import os -import sys -import subprocess - -latticeLocation = 'latjosh-2-callhome/lattices-pushed/' - -tmpdir = 'data/local/data/tmp/ch-d/lattmp' -invalidplfdir = 'data/local/data/tmp/ch-d/invalidplf' -symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' - -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/dev') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/asr.test.plf', 'w+') -invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/invalidPLF', 'w+') -blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/blankPLF', 'w+') -rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/removeLines', 'w+') - -if not os.path.exists(tmpdir): - os.makedirs(tmpdir) -if not os.path.exists(invalidplfdir): - os.makedirs(invalidplfdir) -else: - os.system("rm " + invalidplfdir + "/*") - -def latticeConcatenate(lat1, lat2): - ''' - Concatenates lattices, writes temporary results to tmpdir - ''' - if lat1 == "": - os.system('rm ' + tmpdir + '/tmp.lat') - return lat2 - else: - proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) - proc.wait() - return tmpdir + '/tmp.lat' - - -def findLattice(timeDetail): - ''' - Finds the lattice corresponding to a time segment - ''' - if os.path.isfile(latticeLocation + timeDetail + '.lat'): - return latticeLocation + timeDetail + '.lat' - else: - return -1 - - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
-# Now get timing information to concatenate the ASR outputs - -lineNo = 1 -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') - for line in timingFile: - timeInfo = line.split() - - # For utterances that are concatenated in the translation file, - # the corresponding FSTs have to be translated as well - mergedTranslation = "" - for timeDetail in timeInfo: - tmp = findLattice(timeDetail) - if tmp != -1: - # Concatenate lattices - mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - - print(mergedTranslation) - if mergedTranslation != "": - - # Sanjeev's Recipe : Remove epsilons and topo sort - finalFST = tmpdir + "/final.fst" - os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) - - # Now convert to PLF - proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) - PLFline = proc.stdout.readline() - finalPLFFile = tmpdir + "/final.plf" - finalPLF = open(finalPLFFile, "w+") - finalPLF.write(PLFline) - finalPLF.close() - - # now check if this is a valid PLF, if not write it's ID in a - # file so it can be checked later - proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) - line = proc.stdout.readline() - print("{} {}".format(line, lineNo)) - if line.strip() != "PLF format appears to be correct.": - os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) - invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - else: - provFile.write(PLFline) - else: - blankPLF.write(timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - # Now convert to PLF - lineNo += 1 - -provFile.close() -invalidPLF.close() -blankPLF.close() -rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh deleted file mode 100755 index d06e5fe911f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_spk2gender.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Gets the unique speakers from the file created by fsp_make_trans.pl -# Note that if a speaker appears multiple times, it is categorized as female - -import os -import sys - -tmpFileLocation = 'data/local/tmp/callhome_spk2gendertmp' - -tmpFile = None - -try: - tmpFile = open(tmpFileLocation) -except IOError: - print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' - -speakers = {} - -for line in tmpFile: - comp = line.split(' ') - if comp[0] in speakers: - speakers[comp[0]] = "f" - else: - speakers[comp[0]] = comp[1] - -for speaker, gender in speakers.iteritems(): - print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl deleted file mode 100755 index ec3dfd88037..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_make_trans.pl +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -use utf8; -use File::Basename; - -($tmpdir)=@ARGV; -$trans="$tmpdir/callhome_train_transcripts.flist"; -$reco="$tmpdir/callhome_reco2file_and_channel"; -open(T, "<", "$trans") || die "Can't open transcripts file"; -open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; -open(O, ">$tmpdir/callhome.text.1") || die "Can't open text file for writing"; -open(G, ">$tmpdir/callhome_spk2gendertmp") || die "Can't open the speaker to gender map file"; -binmode(O, ":utf8"); -while () { - $file = $_; - m:([^/]+)\.txt: || die "Bad filename $_"; - $call_id = $1; - print R "$call_id-A $call_id A\n"; - print R "$call_id-B $call_id B\n"; - open(I, "<$file") || die "Opening file $_"; - binmode(I, ":iso88591"); - #Now read each line and extract information - while () { - #136.37 138.10 B: Ah, bueno, mamita. - chomp; - - my @stringComponents = split(":", $_, 2); - my @timeInfo = split(" ", $stringComponents[0]); - $stringComponents[1] =~ s/^\s+|\s+$//g ; - my $words = $stringComponents[1]; - #Check number of components in this array - if ((scalar @stringComponents) >= 2) { - $start = sprintf("%06d", $timeInfo[0] * 100); - $end = sprintf("%06d", $timeInfo[1] * 100); - length($end) > 6 && die "Time too long $end in $file"; - $side = "A"; - if (index($timeInfo[2], "B") != -1) { - $side = "B"; - } - $utt_id = "${call_id}-$side-$start-$end"; - $speaker_id = "${call_id}-$side"; - # All speakers are treated as male because speaker gender info - # is missing in this file - $gender = "m"; - print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; - $words =~ s|\[\[[^]]*\]\]||g; #removes comments - $words =~ s|\{laugh\}|\$laughter\$|g; # replaces laughter tmp - $words =~ s|\[laugh\]|\$laughter\$|g; # replaces laughter tmp - $words =~ s|\{[^}]*\}|\[noise\]|g; # replaces noise - $words =~ s|\[[^]]*\]|\[noise\]|g; # replaces noise - $words =~ s|\[/*([^]]*)\]|\[noise\]|g; # replaces end of noise - $words =~ s|\$laughter\$|\[laughter\]|g; # replaces laughter again - $words =~ s|\(\(([^)]*)\)\)|\1|g; # replaces unintelligible speech - $words =~ s|<\?([^>]*)>|\1|g; # for unrecognized language - $words =~ s|background speech|\[noise\]|g; - $words =~ s|background noise|\[noise\]|g; - $words =~ s/\[/larrow/g; - $words =~ s/\]/rarrow/g; - $words =~ s/[[:punct:]]//g; - $words =~ s/larrow/\[/g; - $words =~ s/rarrow/\]/g; - $words =~ s/[¿¡]//g; - $words =~ s/\h+/ /g; # horizontal whitespace characters - $words = lc($words); - print O "$utt_id $words\n" || die "Error writing to text file"; - } - } - close(I); -} -close(T); -close(R); -close(O); -close(G); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh deleted file mode 100755 index 37e1eca1687..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/callhome_text_pp.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -if [ $# -gt 0 ]; then - sentence=$1 - echo $sentence | sed 's:{^[}]*}:[noise]:' -fi - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh deleted file mode 100755 index 2f478419a18..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/chain/run_tdnn_1g.sh +++ /dev/null @@ -1,294 +0,0 @@ -#!/bin/bash - -# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. -# with bypass resnet connections, and re-tuned. 
-# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:- -# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ] -# %SER 56.98 [ 3577 / 6278 ] -# Scored 6278 sentences, 0 not present in hyp. - -# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn -# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158) - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -train_set=train -test_sets="test dev" -gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -num_threads_ubm=32 -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= -gigaword_workdir= - -# LSTM/chain options -train_stage=-20 -xent_regularize=0.1 -dropout_schedule='0,0@0.20,0.3@0.50,0' - -# training chunk-options -chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 - -# training options -srand=0 -remove_egs=true - -#decode options -test_online_decoding=false # if true, it will run the last decoding stage. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 17 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 18 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 19 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" - tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" - linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" - prefinal_opts="l2-regularize=0.01" - output_opts="l2-regularize=0.005" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 - tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 - tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 - tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 - linear-component name=prefinal-l dim=192 $linear_opts - - - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - - prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.0 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.srand $srand \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.frames-per-iter 5000000 \ - --trainer.optimization.num-jobs-initial 1 \ - --trainer.optimization.num-jobs-final=2 \ - --trainer.optimization.initial-effective-lrate 0.0005 \ - --trainer.optimization.final-effective-lrate 0.00005 \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.optimization.momentum 0.0 \ - --egs.chunk-width $chunk_width \ - --egs.chunk-left-context 0 \ - --egs.chunk-right-context 0 \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --cleanup.remove-egs $remove_egs \ - --use-gpu true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir exp/tri5a_lats_nodup_sp \ - --dir $dir || exit 1; -fi - -if [ $stage -le 21 ]; then - # The reason we are using data/lang_test here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - #LM was trained only on Fisher Spanish train subset. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test \ - $tree_dir $tree_dir/graph_fsp_train || exit 1; - -fi - -# Let's train first a small RNNLM on Fisher train set -rnnlmdir=exp/rnnlm_lstm_tdnn_1b -if [ $stage -le 22 ]; then - rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; -fi - -if [ $stage -le 23 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l ") - print(" Processes the text before text normalisation to convert uppercase words as space separated letters") - sys.exit() - -inputfile=codecs.open(sys.argv[1], encoding='utf-8') -outputfile=codecs.open(sys.argv[2], encoding='utf-8', mode='w') - -for line in inputfile: - words = line.split() - textout = "" - wordcnt = 0 - for word in words: - if re.match(r"\b([A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ])+[']?s?\b", word): - if wordcnt > 0: - word = re.sub('\'?s', 's', word) - textout = textout + " ".join(word) + " " - else: - textout = textout + word + " " - else: - textout = textout + word + " " - if word.isalpha(): wordcnt = wordcnt + 1 - outputfile.write(textout.strip()+ '\n') - -inputfile.close() -outputfile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh deleted file mode 100755 index 1880b3a90cb..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/clean_txt_dir.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash - -# Script to clean up gigaword LM text -# Removes punctuations, does case normalization - -stage=0 -nj=500 - -. ./path.sh -. ./cmd.sh -. 
./utils/parse_options.sh - -set -euo pipefail - -if [ $# -ne 2 ]; then - echo "Usage: $0 " - exit 1; -fi - -if [ ! -s `which normalizer_main` ] ; then - echo "Sparrowhawk normalizer was not found installed !" - echo "Go to $KALDI_ROOT/tools and execute install_sparrowhawk.sh and try again!" - exit 1 -fi - -txtdir=$1 -textdir=$(realpath $txtdir) -outdir=$(realpath $2) - -workdir=$outdir/tmp -if [ $stage -le 0 ]; then - rm -rf $outdir - mkdir -p $workdir - mkdir -p $textdir/splits - mkdir -p $outdir/data - split -l 1000000 $textdir/in.txt $textdir/splits/out - numsplits=0 - for x in $textdir/splits/*; do - numsplits=$((numsplits+1)) - ln -s $x $outdir/data/$numsplits - done - echo $numsplits - cp $SPARROWHAWK_ROOT/documentation/grammars/sentence_boundary_exceptions.txt . - $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ - local/run_norm.sh \ - sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ - $outdir/data \ - JOB \ - $outdir/sparrowhawk/ - cat $outdir/sparrowhawk/*.txt | sed "/^$/d" > $outdir/text_normalized - - # check if numbers are there in normalized output - awk '{for(i=1;i<=NF;i++) {if (!seen[$i]) {print $i; seen[$i]=1} }}' \ - $outdir/text_normalized > $outdir/unique_words - grep "[0-9]" $outdir/unique_words | sort -u > $outdir/numbers -fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh deleted file mode 100755 index d48a96db5c4..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/create_oracle_ctm.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# No sanity checks here, they need to be added - -data=data/callhome_test -dir=exp/tri5a/decode_callhome_test -lang=data/lang -LMWT=13 - -[ -f ./path.sh ] && . ./path.sh - -cmd=run.pl -filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" -name=`basename $data`; -model=$dir/../final.mdl # assume model one level up from decoding dir. -symTable=$lang/words.txt - -if [ ! -f $dir/oracle/oracle.lat.gz ]; then - cat $data/text | utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ - lattice-oracle --write-lattices="ark:|gzip -c > $dir/oracle/oracle.lat.gz" \ - "ark:gunzip -c $dir/lat.*.gz|" ark:- ark:- > /dev/null 2>&1 -fi - -lattice-align-words $lang/phones/word_boundary.int $model \ - "ark:gunzip -c $dir/oracle/oracle.lat.gz|" ark:- | \ - lattice-1best --lm-scale=$LMWT ark:- ark:- | nbest-to-ctm ark:- - | \ - utils/int2sym.pl -f 5 $lang/words.txt | \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - > $dir/oracle/$name.ctm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh deleted file mode 100755 index 8a60dc9d422..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/create_splits.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -data_dir=data -train_all=data/train_all - -if [ $# -lt 1 ]; then - echo "Specify the location of the split files" - exit 1; -fi - -splitFile=$1 - -# Train first -for split in train dev test dev2 -do - - cp -r $train_all $data_dir/$split - - awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ - $splitFile/$split $train_all/segments > $data_dir/$split/segments - - n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $data_dir/$split/segments | sort | uniq | wc -l` - - echo "$n conversations left in split $split" - - utils/fix_data_dir.sh $data_dir/$split - utils/validate_data_dir.sh $data_dir/$split -done - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh deleted file mode 100755 index 7d09f574580..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/ctm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -. ./cmd.sh - -split=test -data_dir=data/test -decode_dir=exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4/ -lang_dir=data/lang - -# Create the STM file -# Always create this file before creating the CTM files so that -# channel numbers are properly created. -if [ ! -f $data_dir/stm ]; then - /export/a11/guoguo/babel/103-bengali-limitedLP.official/local/prepare_stm.pl $data_dir -fi - -# Create the CTM file -steps/get_ctm.sh $data_dir $lang_dir $decode_dir - -# Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} - -# Get the environment variables -. /export/babel/data/software/env.sh - -# Start scoring -/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ - $decode_dir - -# Print a summary of the result -grep "Percent Total Error" $decode_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py b/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py deleted file mode 100755 index 6f3d3f80c95..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/decode_report.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -# Author : Gaurav Kumar (Johns Hopkins University) -# Gets a report on what the best word error rate was and which iteration -# led to it. This is needed both for reporting purposes and for setting -# the acoustic scale weight which extracting lattices. 
-# This script is specific to my partitions and needs to be made more general -# or modified - -from __future__ import print_function -import subprocess -import os - -decode_directories = ['exp/tri5a/decode_dev', - 'exp/tri5a/decode_test', - 'exp/tri5a/decode_dev2', - 'exp/sgmm2x_6a/decode_dev_fmllr', - 'exp/sgmm2x_6a/decode_test_fmllr', - 'exp/sgmm2x_6a/decode_dev2_fmllr', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it4', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it1', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it2', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it3', - 'exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it4' - ] - -def get_best_wer(decode_dir): - best_iteration = 0 - best_wer = 100.0 - for i in range(16): - if os.path.isfile("{}/wer_{}".format(decode_dir, i)): - result = subprocess.check_output("tail -n 3 {}/wer_{}".format(decode_dir, i), shell=True) - wer_string = result.split("\n")[0] - wer_details = wer_string.split(' ') - # Get max WER - wer = float(wer_details[1]) - if wer < best_wer: - best_wer = wer - best_iteration = i - return best_iteration, best_wer - -for decode_dir in decode_directories[:6]: - print(decode_dir) - print(get_best_wer(decode_dir)) - -# Separate processing for bMMI stuff -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[6:10]: - iteration, wer = get_best_wer(decode_dir) - if wer < best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[10:14]: - iteration, wer = get_best_wer(decode_dir) - if wer < best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[14:18]: - iteration, wer = get_best_wer(decode_dir) - if wer < best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[18:22]: - iteration, wer = get_best_wer(decode_dir) - if wer <= best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) - -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[22:26]: - iteration, wer = get_best_wer(decode_dir) - if wer <= best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) 
- -best_wer = 100.0 -best_dir = "" -best_iteration = 0 - -for decode_dir in decode_directories[26:]: - iteration, wer = get_best_wer(decode_dir) - if wer <= best_wer: - best_wer = wer - best_dir = decode_dir - best_iteration = iteration - -print(best_dir) -print((best_iteration, best_wer)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl deleted file mode 100755 index 2da41182d20..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/find_unique_phones.pl +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env perl -#Finds unique phones from the basic rules file -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -use utf8; - -($b)=$ARGV[0]; -($tmpdir)=$ARGV[1]; -open(BB, "<", "$b/basic_rules") || die "Can't open basic rules"; -binmode(BB, ":iso88591"); -open(O, ">$tmpdir/phones") || die "Can't open text file for writing"; -binmode(O, ":utf8"); -my %phones = qw(); -while () { - chomp; - my @stringComponents = split(/\t/); - m/->\s(\S+)/; - my $phone = $1; - $phone =~ tr/áéíóú/aeiou/; - $phones{$phone} = 1; -} -foreach my $p (keys %phones) { - print O $p, "\n"; -} -#print keys %phones; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh deleted file mode 100755 index 20220d107bc..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fix_stm.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash - -# Fixes the CALLHOME stm files -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -data_dir=$1 - -cat $data_dir/stm | awk '{$1=substr(tolower($1),0,length($1)-4);print;}' > $data_dir/stm_new -mv $data_dir/stm $data_dir/stm.bak -mv $data_dir/stm_new $data_dir/stm diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh deleted file mode 100755 index 242359e7c28..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_all_gigaword.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -set -e - -# Path to Gigaword corpus with all data files decompressed. -export GIGAWORDDIR=$1 -# The directory to write output to -export OUTPUTDIR=$2 -# The number of jobs to run at once -export NUMJOBS=$3 - -echo "Flattening Gigaword with ${NUMJOBS} processes..." -mkdir -p $OUTPUTDIR -find ${GIGAWORDDIR}/data/*/* -type f -print -exec local/flatten_gigaword/run_flat.sh {} ${OUTPUTDIR} \; -echo "Combining the flattened files into one..." -cat ${OUTPUTDIR}/*.flat > ${OUTPUTDIR}/flattened_gigaword.txt diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py deleted file mode 100644 index 29f6766dd84..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/flatten_one_gigaword.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- - -import logging -import os -import re -import spacy -import gzip - -from argparse import ArgumentParser -from bs4 import BeautifulSoup - -en_nlp = spacy.load("es") - - -def flatten_one_gigaword_file(file_path): - f = gzip.open(file_path) - html = f.read() - # Parse the text with BeautifulSoup - soup = BeautifulSoup(html, "html.parser") - - # Iterate over all
<p>
items and get the text for each. - all_paragraphs = [] - for paragraph in soup("p"): - # Turn inter-paragraph newlines into spaces - paragraph = paragraph.get_text() - paragraph = re.sub(r"\n+", "\n", paragraph) - paragraph = paragraph.replace("\n", " ") - # Tokenize the paragraph into words - tokens = en_nlp.tokenizer(paragraph) - words = [str(token) for token in tokens if not - str(token).isspace()] - if len(words) < 3: - continue - all_paragraphs.append(words) - # Return a list of strings, where each string is a - # space-tokenized paragraph. - return [" ".join(paragraph) for paragraph in all_paragraphs] - - -if __name__ == "__main__": - log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - logging.basicConfig(level=logging.INFO, format=log_fmt) - logger = logging.getLogger(__name__) - - parser = ArgumentParser(description=("Flatten a gigaword data file for " - "use in language modeling.")) - parser.add_argument("--gigaword-path", required=True, - metavar="", type=str, - help=("Path to Gigaword directory, with " - "all .gz files unzipped.")) - parser.add_argument("--output-dir", required=True, metavar="", - type=str, help=("Directory to write final flattened " - "Gigaword file.")) - - A = parser.parse_args() - all_paragraphs = flatten_one_gigaword_file(A.gigaword_path) - output_path = os.path.join(A.output_dir, - os.path.basename(A.gigaword_path) + ".flat") - with open(output_path, "w") as output_file: - for paragraph in all_paragraphs: - output_file.write("{}\n".format(paragraph)) diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh deleted file mode 100755 index 6b236be0ab9..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/flatten_gigaword/run_flat.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash -set -e - -. ./path_venv.sh - -# Path to Gigaword corpus with all data files decompressed. -GIGAWORDPATH=$1 -# The directory to write output to -OUTPUTDIR=$2 -file=$(basename ${GIGAWORDPATH}) -if [ ! -e ${OUTPUTDIR}/${file}.flat ]; then - echo "flattening to ${OUTPUTDIR}/${file}.flat" - python local/flatten_gigaword/flatten_one_gigaword.py --gigaword-path ${GIGAWORDPATH} --output-dir ${OUTPUTDIR} -else - echo "skipping ${file}.flat" -fi - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh deleted file mode 100755 index fb765b57e69..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_create_test_lang.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# - -if [ -f path.sh ]; then . ./path.sh; fi - -mkdir -p data/lang_test - -arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -mkdir -p data/lang_test -cp -r data/lang/* data/lang_test - -gunzip -c "$arpa_lm" | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - -echo Performing further checks - -# Checking that G.fst is determinizable. 
-fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -# Note: we do this with fstdeterminizestar not fstdeterminize, as -# fstdeterminize was taking forever (presumbaly relates to a bug -# in this version of OpenFst that makes determinization slow for -# some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminizestar >/dev/null || echo Error - -# Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic || echo "[log:] LG is not stochastic" - - -echo "$0 succeeded" diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh deleted file mode 100755 index 22b98a6c9db..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_data_prep.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/bash -# -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. -# To be run from one directory above this script. - -# Note: when creating your own data preparation scripts, it's a good idea -# to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the -# transcription file is exactly the same length as the scp file and is also -# sorted on utterance id (missing transcriptions should be removed from the -# scp file using e.g. scripts/filter_scp.pl) - -stage=0 - -export LC_ALL=C - - -if [ $# -lt 2 ]; then - echo "Usage: $0 " - echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" - exit 1; -fi - -cdir=`pwd` -dir=`pwd`/data/local/data -lmdir=`pwd`/data/local/nist_lm -mkdir -p $dir $lmdir -local=`pwd`/local -utils=`pwd`/utils -tmpdir=`pwd`/data/local/tmp -mkdir -p $tmpdir - -. ./path.sh || exit 1; # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command -# line arguments being absolute pathnames. -rm -r links/ 2>/dev/null -mkdir links/ -ln -s $* links - -# Basic spot checks to see if we got the data that we needed -if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; -then - echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti -vely" - exit 1; -fi - -#if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; -if [ ! -d links/LDC2010S01/data/speech ]; -then - echo "Speech directories missing or not properly organised within the speech data dir" - echo "Typical format is LDC2010S01/data/speech" - exit 1; -fi - -#Check the transcripts directories as well to see if they exist -if [ ! 
-d links/LDC2010T04/fisher_spa_tr/data/transcripts ]; -then - echo "Transcript directories missing or not properly organised" - echo "Typical format is LDC2010T04/fisher_spa_tr/data/transcripts" - exit 1; -fi - -#speech_d1=$dir/links/LDC2010S01/DISC1/data/speech -#speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -speech=$dir/links/LDC2010S01/data/speech -transcripts=$dir/links/LDC2010T04/fisher_spa_tr/data/transcripts - -#fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -#fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_s=`find ${speech} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts -#Now check if we got all the files that we needed -#if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -if [ $fcount_s != 819 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively (Total = 819)" - echo "The transcripts should contain 819 files" - exit 1; -fi - -if [ $stage -le 0 ]; then - #Gather all the speech files together to create a file list - #TODO: Train and test split might be required - ( - #find $speech_d1 -iname '*.sph'; - #find $speech_d2 -iname '*.sph'; - find $speech -iname '*.sph'; - ) > $tmpdir/train_sph.flist - - #Get all the transcripts in one place - find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist -fi - -if [ $stage -le 1 ]; then - $local/fsp_make_trans.pl $tmpdir - mkdir -p $dir/train_all - mv $tmpdir/reco2file_and_channel $dir/train_all/ -fi - -if [ $stage -le 2 ]; then - sort $tmpdir/text.1 | grep -v '((' | \ - awk '{if (NF > 1){ print; }}' | \ - sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ - sed 's:\([^<]*\)<\/lname>:\1:g' | \ - sed 's:::g' | \ - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed 's:<\s*cough[\/]*>:[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's:[^<]*<\/background>:[noise]:g' | \ - sed -r 's:<[/]?background[/]?>:[noise]:g' | \ - #One more time to take care of nested stuff - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ - #now handle the exceptions, find a cleaner way to do this? - sed 's:::g' | \ - sed 's:::g' | \ - sed 's:foreign>::g' | \ - sed 's:\[noise\]:[noise] :g' | \ - sed 's:>::g' | \ - #How do you handle numbers? - grep -v '()' | \ - #Now go after the non-printable characters and multiple spaces - sed -r 's:¿::g' | sed 's/^\s\s*|\s\s*$//g' | sed 's/\s\s*/ /g' > $tmpdir/text.2 - cp $tmpdir/text.2 $dir/train_all/text - - #Create segments file and utt2spk file - ! 
cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ - && echo "Error producing utt2spk file" && exit 1; - - cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; - $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); if ($s != $e) {print "$utt $reco $s $e\n"}; ' >$dir/train_all/segments - - $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt -fi - -if [ $stage -le 3 ]; then - for f in `cat $tmpdir/train_sph.flist`; do - # convert to absolute path - make_absolute.sh $f - done > $tmpdir/train_sph_abs.flist - - cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp - cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ - sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; -fi - -if [ $stage -le 4 ]; then - # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. - cd $cdir - $local/fsp_make_spk2gender.sh > $dir/train_all/spk2gender -fi - -fix_data_dir.sh $dir/train_all || exit 1 -validate_data_dir.sh --no-feats $dir/train_all || exit 1 - -echo "Fisher Spanish Data preparation succeeded." - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl deleted file mode 100755 index 538bca58981..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_ideal_data_partitions.pl +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env perl -# -# Johns Hopkins University (Author : Gaurav Kumar) -# -# This script should be run from one directory above the current one -# -# Rough partitions that are needed are : -# -# ASR Train : 120k utterances -# ASR tune : 20k utterances -# ASR eval : 20k utterances -# MT train : 105k utterances -# MT tune : Same as the ASR eval (20k utterances) -# MT eval : 20k utterances -# -# This script tries to find the closest possible matches so that conversations -# belong in one single partition and hence there is no speaker/conversation -# overlap between data partitions - -use Storable 'dclone'; - -$textfile="data/local/data/train_all/text"; -$tmp="data/local/tmp"; - -open(T, "<", "$textfile") || die "Can't open text file"; - -$ongoingConv = ""; -%tmpSplits = (); -@splitNumbers = (17455, 20000, 100000, 20000, 100000); -$splitId = 0; -%splits = (); - -while () { - @myStringComponents = split(/\s/); - @uttid = split('-', $myStringComponents[0]); - $currentConv = $uttid[0]; - if ($currentConv eq $ongoingConv) { - # Same conversation, add to current hash - #print "Same conversation"; - $tmpSplits{$ongoingConv} += 1; - } - else { - # New conversation intiated, first check if there are enough entries - # in the hash - #print $ongoingConv . " " . get_entries_hash(\%tmpSplits) . "\n"; - if (get_entries_hash(\%tmpSplits) > $splitNumbers[$splitId]) { - print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. 
\n"; - #$splits{$splitId} = keys %tmpSplits; - @newArr = keys %tmpSplits; - $splits{$splitId} = dclone(\@newArr); - %tmpSplits = (); - $splitId += 1; - } - $ongoingConv = $currentConv; - $tmpSplits{$ongoingConv} = 1; - } -} -# Put final tmpsplits in the right partition -@newArr = keys %tmpSplits; -$splits{$splitId} = dclone(\@newArr); -foreach (keys %splits) { - #print $_ , " ", $splits{$_}, "\n"; -} -print "Finished processing split " . $splitId . ". It contains " . get_entries_hash(\%tmpSplits) . " entries. \n"; - -# Write splits to file -foreach my $key ( keys %splits ) { - open(S, ">$tmp/split-$key") || die "Can't open splitfile to write"; - foreach my $file ( @{$splits{$key}} ) { - print $file, "\n"; - print S "$file\n" || die "Error writing to file"; - } - close(S); -} - -sub get_entries_hash() { - my $inputHashRef = shift; - $total = 0; - foreach (keys %{$inputHashRef}) - { - $total += $inputHashRef->{$_}; - } - return $total; -} - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh deleted file mode 100755 index 15b1c0064cf..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_spk2gender.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Gets the unique speakers from the file created by fsp_make_trans.pl -# Note that if a speaker appears multiple times, it is categorized as female - -import os -import sys - -tmpFileLocation = 'data/local/tmp/spk2gendertmp' - -tmpFile = None - -try: - tmpFile = open(tmpFileLocation) -except IOError: - print 'The file spk2gendertmp does not exist. Run fsp_make_trans.pl first?' - -speakers = {} - -for line in tmpFile: - comp = line.split(' ') - if comp[0] in speakers: - speakers[comp[0]] = "f" - else: - speakers[comp[0]] = comp[1] - -for speaker, gender in speakers.iteritems(): - print speaker + " " + gender diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl deleted file mode 100755 index 8c3f74e3917..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_make_trans.pl +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -use utf8; -use File::Basename; -($tmpdir)=@ARGV; -#$tmpdir='../data/local/tmp'; -$trans="$tmpdir/train_transcripts.flist"; -$reco="$tmpdir/reco2file_and_channel"; -open(T, "<", "$trans") || die "Can't open transcripts file"; -open(R, "|sort >$reco") || die "Can't open reco2file_and_channel file $!"; -open(O, ">$tmpdir/text.1") || die "Can't open text file for writing"; -open(G, ">$tmpdir/spk2gendertmp") || die "Can't open the speaker to gender map file"; -binmode(O, ":utf8"); -while () { - $file = $_; - m:([^/]+)\.tdf: || die "Bad filename $_"; - $call_id = $1; - print R "$call_id-A $call_id A\n"; - print R "$call_id-B $call_id B\n"; - open(I, "<$file") || die "Opening file $_"; - binmode(I, ":utf8"); - # Get rid of header sections first - foreach ( 0..2 ) { - $tmpLine = ; - } - #Now read each line and extract information - while () { - #20051017_215732_274_fsp.sph 1 0.0 0.909856781803 Audrey female native Audrey 0 0 -1 - chomp; - my @stringComponents = split(/\t/); - - #Check number of components in this array - if ((scalar @stringComponents) >= 11) { - $start = sprintf("%06d", $stringComponents[2] * 100); - $end = sprintf("%06d", $stringComponents[3] * 100); - length($end) > 6 && die "Time too long $end in $file"; - $side = $stringComponents[1] ? "B" : "A"; - $words = $stringComponents[7]; - $utt_id = "${call_id}-$side-$start-$end"; - $speaker_id = "${call_id}-$side"; - $gender = "m"; - if ($stringComponents[5] == "female") { - $gender = "f"; - } - print G "$speaker_id $gender\n" || die "Error writing to speaker2gender file"; - $words =~ s:/rarrow/g; - $words =~ s/[[:punct:]]//g; - $words =~ s/larrow//g; - $words =~ s:lendarrow: 0){ print; }}' > $tmpdir/uniquewords - if [ ! -f "${tmpdir}/es_wordlist.json" ]; then - echo "Could not find the large collection of Spanish words es_wordlist.json" - echo "Trying to download it via wget" - - if ! which wget >&/dev/null; then - echo "This script requires you to first install wget" - exit 1; - fi - - cwd=`pwd` - cd $tmpdir - wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz - - if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then - echo "Download of the large Spanish word list failed" - exit 1; - fi - - tar -xovzf es_wordlist.json.tgz || exit 1; - cd $cwd - fi - - # Merge with gigaword corpus - $local/merge_lexicons.py ${tmpdir} ${lexicon} - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords -fi - -#Then get the list of phones form basic_rules in the lexicon folder -if [ $stage -le 1 ]; then - if [ ! 
-d "$lexicon/callhome_spanish_lexicon_970908" ]; then - echo "Could not find folder callhome_spanish_lexicon_970908 in the lexicon folder" - exit 1; - fi - - # This is a preliminary attempt to get the unique phones from the LDC lexicon - # This will be extended based on our lexicon later - perl $local/find_unique_phones.pl $lexicon/callhome_spanish_lexicon_970908 $tmpdir - -fi - -#Get pronunciation for each word using the spron.pl file in the lexicon folder -if [ $stage -le 2 ]; then - #cd $lexicon/callhome_spanish_lexicon_970908 - # Replace all words for which no pronunciation was generated with an orthographic - # representation - cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ - | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ - | awk -F '[/][/]' '{print $1}' \ - > $tmpdir/lexicon_raw -fi - -#Break the pronunciation down according to the format required by Kaldi -if [ $stage -le 3 ]; then - # Creates a KALDI compatible lexicon, and extends the phone list - perl $local/isolate_phones.pl $tmpdir - cat $tmpdir/phones_extended | sort | awk '{if ($1 != "") {print;}}' > $tmpdir/phones_extended.1 - mv $tmpdir/phones $tmpdir/phones.small - mv $tmpdir/phones_extended.1 $tmpdir/phones - sort $tmpdir/phones -o $tmpdir/phones - paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | sed -r 's:(\S+)\s#.*:\1 oov:g' > $tmpdir/lexicon.1 - #paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | grep -v '#' > $tmpdir/lexicon.1 -fi - -if [ $stage -le 4 ]; then - # silence phones, one per line. - for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt - echo sil > $dir/optional_silence.txt - - # An extra question will be added by including the silence phones in one class. 
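-  # (Given the silence phones written above, the awk line below should end up
-  #  writing the single line "sil laughter noise oov" to extra_questions.txt.)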
- cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ - $dir/extra_questions.txt || exit 1; - - # Remove [] chars from phones - cat $tmpdir/phones | awk '{if ($1 != "_" && $1 != "[" && $1 != "]") {print;}}' > $tmpdir/phones.1 - rm $tmpdir/phones - mv $tmpdir/phones.1 $tmpdir/phones - cp $tmpdir/phones $dir/nonsilence_phones.txt - - if [ -f $tmpdir/lexicon.2 ]; then rm $tmpdir/lexicon.2; fi - cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" - - # Add prons for laughter, noise, oov - for w in `grep -v sil $dir/silence_phones.txt`; do - sed -i "/\[$w\]/d" $tmpdir/lexicon.2 - done - - for w in `grep -v sil $dir/silence_phones.txt`; do - echo "[$w] $w" - done | cat - $tmpdir/lexicon.2 > $tmpdir/lexicon.3 || exit 1; - - cat $tmpdir/lexicon.3 \ - <( echo "mm m" - echo " oov" ) > $tmpdir/lexicon.4 - - # From the lexicon remove _ from the phonetic representation - cat $tmpdir/lexicon.4 | sed 's:\s_::g' > $tmpdir/lexicon.5 - - cp "$tmpdir/lexicon.5" $dir/lexicon.txt - - cat $datadir/text | \ - awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ - sort -nr > $tmpdir/word_counts - - awk '{print $1}' $dir/lexicon.txt | \ - perl -e '($word_counts)=@ARGV; - open(W, "<$word_counts")||die "opening word-counts $word_counts"; - while() { chop; $seen{$_}=1; } - while() { - ($c,$w) = split; - if (!defined $seen{$w}) { print; } - } ' $tmpdir/word_counts > $tmpdir/oov_counts.txt - echo "*Highest-count OOVs are:" - head -n 20 $tmpdir/oov_counts.txt -fi - -$utils/validate_dict_dir.pl $dir -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh deleted file mode 100755 index cebf3b222ab..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/fsp_train_lms.sh +++ /dev/null @@ -1,140 +0,0 @@ -#!/bin/bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# To be run from one level above this directory -# Generate the text for the LM training -tmp_dir=data/local/tmp -train_all=data/local/data/train_all - -if [ $# -lt 1 ]; then - echo "Specify the location of the split files" - exit 1; -fi - -splitFile=$1 -split=train -# Train only -if [ -d $tmp_dir/$split ]; then - rm -r $tmp_dir/$split -fi -cp -r $train_all $tmp_dir/$split - -awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \ -$splitFile/$split $train_all/segments > $tmp_dir/$split/segments - -n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $tmp_dir/$split/segments | sort | uniq | wc -l` - -echo "$n conversations left in split $split" - -utils/fix_data_dir.sh $tmp_dir/$split -# There is no feature file yet, use --no-feats switch -utils/validate_data_dir.sh --no-feats $tmp_dir/$split - -# Now use this training text - -text=$tmp_dir/train/text -lexicon=data/local/dict/lexicon.txt - -for f in "$text" "$lexicon"; do - [ ! -f $x ] && echo "$0: No such file $f" && exit 1; -done - -# This script takes no arguments. It assumes you have already run -# fisher_data_prep.sh and fisher_prepare_dict.sh -# It takes as input the files -#data/train_all/text -#data/local/dict/lexicon.txt - -dir=`pwd`/data/local/lm -mkdir -p $dir -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:`pwd`/../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd ../../../tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. 
- else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - -mkdir -p $dir - - -cleantext=$dir/text.no_oov - -cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ - > $cleantext || exit 1; - - -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs -cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. -cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - -train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -exit 0 - -echo "Baseline" - -# From here is some commands to do a baseline with SRILM (assuming -# you have it installed). -heldout_sent=158126 # Don't change this if you want result to be comparable with - # kaldi_lm results -sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. -mkdir -p $sdir -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train - -cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist - - -ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ - -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz -ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout - -# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM -# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs -# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258 - - -# Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above. -# Difference in WSJ must have been due to different treatment of . -ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout - -# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM -# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs -# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py deleted file mode 100755 index 9c590635562..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_1_best.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. 
Apache 2.0 - -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -import os -import sys - -scoringFile = "exp/sgmm2x_6a_mmi_b0.2/decode_test_it4/scoring/10.tra" -wordsFile = open('exp/sgmm2x_6a/graph/words.txt') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') -oneBestTmp = 'exp/sgmm2x_6a_mmi_b0.2/one-best/asr-test' -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.test', 'w+') -timLocation = '/export/a04/gkumar/corpora/fishcall/fisher/tim' - -def findTranscription(timeDetail): - file1 = open(scoringFile) - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists(oneBestTmp): - os.makedirs(oneBestTmp) - -for item in fileList: - timingFile = open(timLocation + '/' + item + '.es') - newFile = open(oneBestTmp + '/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - - newFile.close() -provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl deleted file mode 100755 index ca5b2a46f8e..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_data_weights.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env perl - -# Nagendra Kumar Goel - -# This takes two arguments: -# 1) Pocolm training output folder -# 2) rnnlm weights file name (for output) - -use POSIX; -use List::Util qw[min max]; - -if (@ARGV != 2) { - die "Usage: get_data_weights.pl \n"; -} - -$pdir = shift @ARGV; -$out = shift @ARGV; - -open(P, "<$pdir/metaparameters") || die "Could not open $pdir/metaparameters"; -open(N, "<$pdir/names") || die "Could not open $pdir/names" ; -open(O, ">$out") || die "Could not open $out for writing" ; - -my %scores = (); - -while() { - @n = split(/\s/,$_); - $name = $n[1]; - $w =
<P>
; - @w = split(/\s/,$w); - $weight = $w[1]; - $scores{$name} = $weight; -} - -$min = min(values %scores); - -for(keys %scores) { - $weightout = POSIX::ceil($scores{$_} / $min); - print O "$_\t1\t$weightout\n"; -} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py deleted file mode 100755 index 5430c18bb5b..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_lattices.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# Extracts one best output for a set of files -# The list of files in the conversations for which 1 best output has to be extracted -# words.txt - -from __future__ import print_function -import os -import sys -import subprocess - -latticeLocation = 'latjosh-bmmi/lattices-pushed/' - -tmpdir = 'data/local/data/tmp/bmmi-t/lattmp' -invalidplfdir = 'data/local/data/tmp/bmmi-t/invalidplf' -symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt' - -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/asr.test.plf', 'w+') -invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/invalidPLF', 'w+') -blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/blankPLF', 'w+') -rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/removeLines', 'w+') - -if not os.path.exists(tmpdir): - os.makedirs(tmpdir) -if not os.path.exists(invalidplfdir): - os.makedirs(invalidplfdir) -else: - os.system("rm " + invalidplfdir + "/*") - -def latticeConcatenate(lat1, lat2): - ''' - Concatenates lattices, writes temporary results to tmpdir - ''' - if lat1 == "": - os.system('rm ' + tmpdir + '/tmp.lat') - return lat2 - else: - proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) - proc.wait() - return tmpdir + '/tmp.lat' - - -def findLattice(timeDetail): - ''' - Finds the lattice corresponding to a time segment - ''' - if os.path.isfile(latticeLocation + timeDetail + '.lat'): - return latticeLocation + timeDetail + '.lat' - else: - return -1 - - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
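-# Each entry in a timing file names one segment lattice on disk; the segments of
-# an utterance are joined with OpenFst before PLF conversion.  The manual
-# equivalent of the calls below would be roughly (file names here are only
-# placeholders):
-#   fstconcat seg1.lat seg2.lat merged.lat
-#   fstrmepsilon merged.lat | fsttopsort - final.fst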
-# Now get timing information to concatenate the ASR outputs - -lineNo = 1 -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - for line in timingFile: - timeInfo = line.split() - - # For utterances that are concatenated in the translation file, - # the corresponding FSTs have to be translated as well - mergedTranslation = "" - for timeDetail in timeInfo: - tmp = findLattice(timeDetail) - if tmp != -1: - # Concatenate lattices - mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - - print(mergedTranslation) - if mergedTranslation != "": - - # Sanjeev's Recipe : Remove epsilons and topo sort - finalFST = tmpdir + "/final.fst" - os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) - - # Now convert to PLF - proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST, stdout=subprocess.PIPE, shell=True) - PLFline = proc.stdout.readline() - finalPLFFile = tmpdir + "/final.plf" - finalPLF = open(finalPLFFile, "w+") - finalPLF.write(PLFline) - finalPLF.close() - - # now check if this is a valid PLF, if not write it's ID in a - # file so it can be checked later - proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) - line = proc.stdout.readline() - print("{} {}".format(line, lineNo)) - if line.strip() != "PLF format appears to be correct.": - os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) - invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - else: - provFile.write(PLFline) - else: - blankPLF.write(timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - # Now convert to PLF - lineNo += 1 - -provFile.close() -invalidPLF.close() -blankPLF.close() -rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh deleted file mode 100755 index 451a7c529fb..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_oracle.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -# Gets lattice oracles -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -if [ $# -lt 3 ]; then - echo "Specify lattice dir, symbol table and text file for partition" - exit 1; -fi - -latticeDir=$1 -textFile=$3 -symTable=$2 -oracleDir=$latticeDir/oracle - -echo $latticeDir -echo $oracleDir - -. ./path.sh - -if [ ! -f $textFile -o ! -f $symTable -o ! -d $latticeDir ]; then - echo "Required files not found" - exit 1; -fi - -mkdir -p $oracleDir - -cat $textFile | sed 's:\[laughter\]::g' | sed 's:\[noise\]::g' | \ - utils/sym2int.pl --map-oov [oov] -f 2- $symTable | \ - $KALDI_ROOT/src/latbin/lattice-oracle --word-symbol-table=$symTable "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- ark,t:$oracleDir/oracle.tra 2>$oracleDir/oracle.log - -sort -k1,1 -u $oracleDir/oracle.tra -o $oracleDir/oracle.tra diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py deleted file mode 100755 index fc13a7af701..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_rnnlm_wordlist.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# 2018 Saikiran Valluri, GoVivace inc. 
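-# (Arguments, as used below: lexicon words file, pocolm words file,
-#  output rnnlm wordlist, output OOV wordlist.)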
- -import os, sys - -if len(sys.argv) < 5: - print( "Usage: python get_rnnlm_wordlist.py ") - sys.exit() - -lexicon_words = open(sys.argv[1], 'r', encoding="utf-8") -pocolm_words = open(sys.argv[2], 'r', encoding="utf-8") -rnnlm_wordsout = open(sys.argv[3], 'w', encoding="utf-8") -oov_wordlist = open(sys.argv[4], 'w', encoding="utf-8") - -line_count=0 -lexicon=[] - -for line in lexicon_words: - lexicon.append(line.split()[0]) - rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') - line_count = line_count + 1 - -for line in pocolm_words: - if not line.split()[0] in lexicon: - oov_wordlist.write(line.split()[0]+'\n') - rnnlm_wordsout.write(line.split()[0] + " " + str(line_count)+'\n') - line_count = line_count + 1 - -lexicon_words.close() -pocolm_words.close() -rnnlm_wordsout.close() -oov_wordlist.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py b/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py deleted file mode 100644 index 3ecd16772d7..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/get_unigram_weights_vocab.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# 2018 Saikiran Valluri, GoVivace inc. - -import os, sys - -if len(sys.argv) < 3: - print("Usage : python . ") - print(" Used for generating the unigram weights for second pass vocabulary from the first pass pocolm training metaparameters.") - sys.exit() - -pocolmdir=sys.argv[1] -unigramwts=open(sys.argv[2], 'w') - -names = open(pocolmdir+"/names", 'r') -metaparams = open(pocolmdir+"/metaparameters", 'r') - -name_mapper={} -for line in names: - fields=line.split() - name_mapper[fields[0]] = fields[1] - -lns = metaparams.readlines() -for lineno in range(len(name_mapper.keys())): - line = lns[lineno] - fileid = line.split()[0].split("_")[-1] - weight = line.split()[1] - unigramwts.write(name_mapper[fileid] + " " + weight + "\n") - -names.close() -unigramwts.close() -metaparams.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl deleted file mode 100755 index 0366dcdacb0..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/isolate_phones.pl +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Once the phonetic representation for words is generated by the LDC lexicon -# This script converts them into a KALDI compatible format -# In addition, it extends the list of phonemes to consider based on -# orthograhic representations of those words which do not have stressed vowels - -use utf8; - -($tmpdir)=$ARGV[0]; -open(L, "<", "$tmpdir/lexicon_raw") || die "Can't open raw lexicon"; -open(P, "<" , "$tmpdir/phones") || die "Can't open phone file"; -open(I, ">$tmpdir/lexicon_one_column") || die "Can't open text file for writing"; -open(E, ">$tmpdir/phones_extended") || die "Can't open ex-phone file for writing"; -binmode(P, ":utf8"); -binmode(L, ":utf8"); -binmode(I, ":utf8"); -binmode(E, ":utf8"); - -#Get all phones -my %phones = qw(); -while (
<P>
) { - chomp; - $phones{$_} = 1; -} - -print @phones; - -while () { - if (substr($_, 0, 1) eq "#") { - print I $_; - next; - } - $len = length; - $current = 0; - $splitWord = ""; - while ($current < $len) { - #First check for two char codes - $currentChar2 = substr($_, $current, 2); - $currentChar1 = substr($_, $current, 1); - if (exists($phones{$currentChar2})) { - $splitWord = $splitWord . " " . $currentChar2; - $current = $current + 2; - } - else { - # Check if this phone exists - if (!exists($phones{$currentChar1})) { - $phones{$currentChar1} = 1 - } - $splitWord = $splitWord . " " . $currentChar1; - $current = $current + 1; - } - } - $splitWord =~ s/^\s*(.*?)\s*$/$1/; - print I $splitWord, "\n"; -} - -# Now write the phones to the extended phone file -foreach my $key (keys %phones) { - print E $key, "\n"; -} - -close(L); -close(P); -close(I); -close(E); diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh deleted file mode 100755 index bbe0af5810c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/latconvert.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bash -# Author : Gaurav Kumar, Johns Hopkins University -# Creates OpenFST lattices from Kaldi lattices -# This script needs to be run from one level above this directory - -. ./path.sh - -if [ $# -lt 3 ]; then - echo "Enter the latdir (where the lattices will be put), the decode dir containing lattices and the acoustic scale" - exit 1 -fi - -prunebeam=2 - -latdir=$1 -decode_dir=$2 -acoustic_scale=$3 -#latdir="latjosh-2-callhome" -#decode_dir=exp/tri5a/decode_$partition -#acoustic_scale=0.077 - -stage=0 - -if [ -d $decode_dir ] -then - # TODO:Add scaling factor for weights, how? - rawLatDir="lattices" - compiledLatDir="lattices-bin" - preplfLatDir="lattices-pushed" - - mkdir -p $latdir - mkdir -p $latdir/$rawLatDir - mkdir -p $latdir/$compiledLatDir - mkdir -p $latdir/$preplfLatDir - - for l in $decode_dir/lat.*.gz - do - ( - # Extract file name and unzip the file first - bname=${l##*/} - bname="$latdir/${bname%.gz}" - gunzip -c $l > "$bname.bin" - - if [ $stage -le 0 ]; then - - # Now copy into ark format - $KALDI_ROOT/src/latbin/lattice-copy ark:$bname.bin ark,t:- > "$bname.raw" - - # Prune lattices - $KALDI_ROOT/src/latbin/lattice-prune --acoustic-scale=$acoustic_scale --beam=$prunebeam ark:"$bname.raw" ark:"$bname.pruned" - - # Convert to an openfst compatible format - $KALDI_ROOT/src/latbin/lattice-to-fst --lm-scale=1.0 --acoustic-scale=$acoustic_scale ark:$bname.pruned ark,t:$bname.ark.fst - - fi - - if [ $stage -le 1 ]; then - fileName="" - fileLine=0 - - while read line; do - if [ $fileLine = 0 ]; then - fileName="$line" - fileLine=1 - continue - fi - if [ -z "$line" ]; then - fileLine=0 - continue - fi - # Replace laugh, unk, oov, noise with eps - echo "$line" | awk '{if ($3 == 2038 || $3 == 2039 || $3 == 2040) {$3 = 0; $4 = 0} print}' >> "$latdir/$rawLatDir/$fileName.lat" - done < $bname.ark.fst - echo "Done isolating lattices" - fi - ) & - done - wait - rm $latdir/*.bin - rm $latdir/*.pruned - - - if [ $stage -le 2 ]; then - #Compile lattices - for l in $latdir/$rawLatDir/*.lat - do - ( - # Arc type needs to be log - bname=${l##*/} - fstcompile --arc_type=log $latdir/$rawLatDir/$bname $latdir/$compiledLatDir/$bname - ) & - done - wait - echo "Done compiling lattices." 
- fi - - if [ $stage -le 3 ]; then - #Sanjeev's Recipe for creating valid PLF compatible FSTs" - # Create a dummy FST with one state and no arcs first - echo 0 | fstcompile --arc_type=log - $latdir/$preplfLatDir/dummy.fst - # Push Lattice weights towards initial state - for l in $latdir/$compiledLatDir/*.lat - do - ( - bname=${l##*/} - fstrmepsilon $latdir/$compiledLatDir/$bname | \ - fstpush --push_weights --remove_total_weight - | \ - # Do not topo sort here, do it before converting into PLF - # Sanjeev's Recipe : Concatenate with dummy FST - fstconcat - $latdir/$preplfLatDir/dummy.fst | \ - fstreverse - | \ - fstrmepsilon - | \ - fstreverse - $latdir/$preplfLatDir/$bname - ) & - done - wait - # Let's take a moment to thank the dummy FST for playing its - # part in this process. However, it has to go now. - rm $latdir/$preplfLatDir/dummy.fst - echo "Done performing fst push (initial state)" - fi -else - echo "Complete training and decoding first" -fi diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py deleted file mode 100755 index 94546dc44c3..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/merge_lexicons.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# 2018 Saikiran Valluri, GoVivace inc., Avaaya - -# Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon -from __future__ import print_function -import sys -import re -import json -import codecs -import operator - -wordlimit = 64000 -tmpdir = sys.argv[1] -ldc_lexicon = sys.argv[2] -uw_fisher = tmpdir + "/uniquewords" -uw_gigaword = tmpdir + "/es_wordlist.json" -uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" - -filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') -merged_lexicon = [] -# All three lexicons are in different formats -# First add the data from lexicon_fisher (A) into the dictionary -fisher = codecs.open(uw_fisher, encoding='utf-8') -for line in fisher: - merged_lexicon.append(line.strip()) -fisher.close() - -print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) - -# Now add data from the LDC lexicon -ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) - -print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) - -# Finally add the gigaword data -gigaword = json.load(open(uw_gigaword)) -gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) - -for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break - - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) - -# Now write the uniquewords to a file -lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') -ltuples = sorted(merged_lexicon) - -for item in ltuples: - if not item==u'ñ' and not re.search(filtered_letters, item): - lf.write(item + "\n") - -lf.close() - -print("Finshed writing unique words") diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh deleted file mode 100755 index a95893f698a..00000000000 --- 
a/egs/fisher_callhome_spanish/s5_gigaword/local/monitor_denlats.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -currentJob=0 - -dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats - -for f in $dir/.done.*; do - d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` - if [ $d -gt $currentJob ]; then - currentJob=$d - fi -done - -currentJob=$((currentJob+1)) - -echo Currently processing job : $currentJob - -for i in $(seq 210); do - job[$i]=$i -done - -dir=/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/exp/sgmm2x_6a_denlats/log/$currentJob/q - -for f in $dir/done.*; do - d=`echo ${f##*/} | awk 'BEGIN {FS="."} {print $3}'` - unset job[$d] -done - -echo sub-splits left : ${#job[@]} -echo ${job[@]} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh deleted file mode 100755 index cc9de4d26c5..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/nnet3/run_ivector_common.sh +++ /dev/null @@ -1,187 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -# This script is called from scripts like local/nnet3/run_tdnn.sh and -# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It -# contains the common feature preparation and iVector-related parts of the -# script. See those scripts for examples of usage. - - -stage=7 -nj=30 -train_set=train # you might set this to e.g. train. -test_sets="test dev" -gmm=tri5a # This specifies a GMM-dir from the features of the type you're training the system on; - # it should contain alignments for 'train_set'. - -num_threads_ubm=32 -nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. - # in the tedlium recip it's _cleaned). - -. ./cmd.sh -. ./path.sh -. utils/parse_options.sh - - -gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp - -for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do - if [ ! -f $f ]; then - echo "$0: expected file $f to exist" - exit 1 - fi -done - - - -if [ $stage -le 7 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then - echo "$0: data/${train_set}_sp_hires/feats.scp already exists." - echo " ... Please either remove it, or rerun this script with stage > 7." - exit 1 -fi - - -if [ $stage -le 8 ]; then - echo "$0: preparing directory for speed-perturbed data" - utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp -fi - -if [ $stage -le 9 ]; then - echo "$0: creating high-resolution MFCC features" - - # this shows how you can split across multiple file-systems. we'll split the - # MFCC dir across multiple locations. You might want to be careful here, if you - # have multiple copies of Kaldi checked out and run the same recipe, not to let - # them overwrite each other. - mfccdir=data/${train_set}_sp_hires/data - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage - fi - - for datadir in ${train_set}_sp ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # do volume-perturbation on the training data prior to extracting hires - # features; this helps make trained nnets more invariant to test data volume. 
- utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - - for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires - steps/compute_cmvn_stats.sh data/${datadir}_hires - utils/fix_data_dir.sh data/${datadir}_hires - done -fi - -if [ $stage -le 10 ]; then - echo "$0: computing a subset of data to train the diagonal UBM." - - mkdir -p exp/nnet3${nnet3_affix}/diag_ubm - temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm - - # train a diagonal UBM using a subset of about a quarter of the data - num_utts_total=$(wc -l in the history of a n-gram -# un-comment the following line -#limit_unk_history_opt="--limit-unk-history=true" - -for order in ${ngram_order}; do - # decide on the vocabulary. - # Note: you'd use --wordlist if you had a previously determined word-list - # that you wanted to use. - lm_name="${num_word}_${order}" - min_counts='' - # Note: the following might be a more reasonable setting: - # min_counts='fisher=2 swbd1=1' - if [ -n "${min_counts}" ]; then - lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" - fi - unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm - train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 ${max_memory} \ - --min-counts=${min_counts} \ - --keep-int-data=true ${fold_dev_opt} ${bypass_metaparam_optim_opt} \ - ${limit_unk_history_opt} ${textdir} ${order} ${lm_dir}/work ${unpruned_lm_dir} - - if [ $pocolm_stage -eq 2 ];then - mkdir -p ${arpa_dir} - format_arpa_lm.py ${max_memory} ${unpruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_unpruned.arpa.gz - - # example of pruning. note: the threshold can be less than or more than one. - get_data_prob.py ${max_memory} ${textdir}/dev.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - for threshold in 1.0 2.0 4.0; do - pruned_lm_dir=${lm_dir}/${lm_name}_prune${threshold}.pocolm - prune_lm_dir.py --final-threshold=${threshold} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 5 | head -n 3 - get_data_prob.py ${max_memory} ${textdir}/dev.txt ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' - - format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${threshold}.arpa.gz - - done - - # example of pruning by size. - size=1000000 - pruned_lm_dir=${lm_dir}/${lm_name}_prune${size}.pocolm - prune_lm_dir.py --target-num-ngrams=${size} ${max_memory} ${unpruned_lm_dir} ${pruned_lm_dir} 2>&1 | tail -n 8 | head -n 6 | grep -v 'log-prob changes' - get_data_prob.py ${textdir}/dev.txt ${max_memory} ${pruned_lm_dir} 2>&1 | grep -F '[perplexity' - - format_arpa_lm.py ${max_memory} ${pruned_lm_dir} | gzip -c > ${arpa_dir}/${lm_name}_${order}gram_prune${size}.arpa.gz - fi -done - -# (run local/srilm_baseline.sh ${num_word} to see the following result e.g. local/srilm_baseline.sh 40000 ) - -# the following does does some self-testing, including -# that the computed derivatives are accurate. 
-# local/self_test.sh - -# perplexities from pocolm-estimated language models with pocolm's interpolation -# method from orders 3, 4, and 5 are: -# order 3: optimize_metaparameters.py: final perplexity without barrier function was -4.358818 (perplexity: 78.164689) -# order 4: optimize_metaparameters.py: final perplexity without barrier function was -4.309507 (perplexity: 74.403797) -# order 5: optimize_metaparameters.py: final perplexity without barrier function was -4.301741 (perplexity: 73.828181) - -# note, the perplexities from pocolm-estimated language models with SRILM's -# interpolation from orders 3 and 4 are (from local/pocolm_with_srilm_combination.sh), -# 78.8449 and 75.2202 respectively. - -# note, the perplexities from SRILM-estimated language models with SRILM's -# interpolation tool from orders 3 and 4 are (from local/srilm_baseline.sh), -# 78.9056 and 75.5528 respectively. diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py deleted file mode 100755 index 5c68e1204b2..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/process_oracle.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# Processes lattice oracles - -import os -import sys - -oracleDir = "exp/tri5a/decode_callhome_train/oracle" -wordsFile = open('exp/sgmm2x_6a/graph/words.txt') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/train') -oracleTmp = 'exp/tri5a/one-best/oracle-ch-train' -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/oracle.train', 'w+') -timLocation = '/export/a04/gkumar/corpora/fishcall/callhome/tim' - -def findTranscription(timeDetail): - file1 = open(oracleDir + "/oracle.tra") - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
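-# (oracle.tra from lattice-oracle holds integer word ids; the id-to-word mapping
-#  applied below via words.txt is essentially what
-#  utils/int2sym.pl -f 2- words.txt <oracle.tra would produce.)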
-# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists(oracleTmp): - os.makedirs(oracleTmp) - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -for item in fileList: - timingFile = open(timLocation + '/' + item + '.es') - newFile = open(oracleTmp + '/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - - newFile.close() -provFile.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh deleted file mode 100755 index 1b54b304e50..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rescore.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -. ./cmd.sh - -for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test_fmllr \ - exp/sgmm2x_6a_mmi_b0.2/decode_test_fmllr_it$iter & -done - - -for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_dev data/lang data/dev exp/sgmm2x_6a/decode_dev_fmllr \ - exp/sgmm2x_6a_mmi_b0.2/decode_dev_fmllr_it$iter & -done - - -for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_dev2 data/lang data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr \ - exp/sgmm2x_6a_mmi_b0.2/decode_dev2_fmllr_it$iter & -done diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh deleted file mode 100755 index 3850910f312..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/rnnlm.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) -# 2015 Guoguo Chen -# 2017 Hainan Xu -# 2017 Xiaohui Zhang - -# This script trains LMs on the swbd LM-training data. - -# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. -# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0. -# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71 -# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91 - - -dir=Spanish_gigawrd/rnnlm -pocolm_dir=Spanish_gigawrd/work_pocolm/lm/110000_3.pocolm_pruned -wordslist= -embedding_dim=1024 -lstm_rpd=256 -lstm_nrpd=256 -stage=0 -train_stage=-30 -text_dir=Spanish_gigawrd/text_lm - -. ./cmd.sh -. ./utils/parse_options.sh - -mkdir -p $dir/config -set -e - -for f in $text_dir/dev.txt; do - [ ! 
-f $f ] && \ - echo "$0: expected file $f to exist;" && exit 1 -done - -if [ $stage -le 0 ]; then - if [ -f $text_dir/unigram_weights ] ; then - mv $text_dir/unigram_weights $pocolm_dir/ - fi - cp $wordslist $dir/config/words.txt - n=`cat $dir/config/words.txt | wc -l` - echo " $n" >> $dir/config/words.txt - - # words that are not present in words.txt but are in the training or dev data, will be - # mapped to during training. - echo "" >$dir/config/oov.txt - local/get_data_weights.pl $pocolm_dir $dir/config/data_weights.txt - rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \ - --unk-word="" \ - --data-weights-file=$dir/config/data_weights.txt \ - $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt - - # choose features - rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ - --use-constant-feature=true \ - --special-words=',,,,[noise],[laughter]' \ - $dir/config/words.txt > $dir/config/features.txt -fi - -if [ $stage -le 1 ]; then - cat <$dir/config/xconfig - input dim=$embedding_dim name=input - relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1)) - fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd - relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3)) - fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd - relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3)) - output-layer name=output include-log-softmax=false dim=$embedding_dim -EOF - rnnlm/validate_config_dir.sh $text_dir $dir/config -fi - -if [ $stage -le 2 ]; then - rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir -fi - -if [ $stage -le 3 ]; then - rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 \ - --stage $train_stage --num-epochs 5 --cmd "$train_cmd" $dir -fi - -exit 0 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh deleted file mode 100755 index f88fecc815c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/run_norm.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -punctuation_symbols=( "," "\"" "\`" "\:" "(" ")" "-" ";" "?" "!" 
"/" "_" "{" "}" "*" ) - -config=$1 -path_prefix=$2 -data=$3 -job=$4 -dir=$5 - -substitute_arg="" -num_syms=0 - -for i in "${punctuation_symbols[@]}"; do - symbol=${punctuation_symbols[${num_syms}]} - if [ $num_syms -eq 0 ]; then - substitute_arg="sed 's:${i}: :g'" - else - substitute_arg=$substitute_arg" | sed 's:${i}: :g'" - fi - substitute_arg=$substitute_arg" |sed 's:${i}$: :g' | sed 's:^${i}: :g'" - num_syms=$((num_syms+1)) -done -mkdir -p $dir/normalize/$job -local/clean_abbrevs_text.py $data/$job $data/"$job"_processed -mv $data/"$job"_processed $data/$job -echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh - -bash $dir/normalize/$job/substitute.sh | \ - sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ - sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text -normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh deleted file mode 100755 index 9148b1f1171..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/run_sgmm2x.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -# This is as run_sgmm2.sh but excluding the "speaker-dependent weights", -# so not doing the symmetric SGMM. - -. ./cmd.sh - -## SGMM on top of LDA+MLLT+SAT features. -if [ ! -f exp/ubm6a/final.mdl ]; then - steps/train_ubm.sh --silence-weight 0.5 --cmd "$train_cmd" 800 data/train data/lang exp/tri5a_ali exp/ubm6a || exit 1; -fi -# Double the number of SAT states : sanjeev -steps/train_sgmm2.sh --spk-dep-weights false --cmd "$train_cmd" 10000 120000 \ - data/train data/lang exp/tri5a_ali exp/ubm6a/final.ubm exp/sgmm2x_6a || exit 1; - -utils/mkgraph.sh data/lang_test exp/sgmm2x_6a exp/sgmm2x_6a/graph || exit 1; - -steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev || exit 1; - -steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev exp/sgmm2x_6a/graph data/dev exp/sgmm2x_6a/decode_dev_fmllr || exit 1; - -steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test || exit 1; - -steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_test exp/sgmm2x_6a/graph data/test exp/sgmm2x_6a/decode_test_fmllr || exit 1; - -steps/decode_sgmm2.sh --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2 || exit 1; - -steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 25 --cmd "$decode_cmd" \ - --transform-dir exp/tri5a/decode_dev2 exp/sgmm2x_6a/graph data/dev2 exp/sgmm2x_6a/decode_dev2_fmllr || exit 1; - - # Now we'll align the SGMM system to prepare for discriminative training. 
- steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri5a \ - --use-graphs true --use-gselect true data/train data/lang exp/sgmm2x_6a exp/sgmm2x_6a_ali || exit 1; - steps/make_denlats_sgmm2.sh --nj 30 --sub-split 210 --cmd "$decode_cmd" --transform-dir exp/tri5a \ - data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats - steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri5a --boost 0.2 \ - data/train data/lang exp/sgmm2x_6a_ali exp/sgmm2x_6a_denlats exp/sgmm2x_6a_mmi_b0.2 - - for iter in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri5a/decode_test data/lang data/test exp/sgmm2x_6a/decode_test exp/sgmm2x_6a_mmi_b0.2/decode_test_it$iter & - done - -wait -steps/decode_combine.sh data/test data/lang exp/tri1/decode exp/tri2a/decode exp/combine_1_2a/decode || exit 1; -steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_mmi/decode exp/combine_sgmm2x_4a_3b/decode || exit 1; -# combining the sgmm run and the best MMI+fMMI run. -steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a/decode exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_3b_fmmic5/decode || exit 1; - -steps/decode_combine.sh data/test data/lang exp/sgmm2x_4a_mmi_b0.2/decode_it4 exp/tri3b_fmmi_c/decode_it5 exp/combine_sgmm2x_4a_mmi_3b_fmmic5/decode || exit 1; - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh deleted file mode 120000 index 0afefc3158c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/score.sh +++ /dev/null @@ -1 +0,0 @@ -../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh deleted file mode 100755 index 21b793a4d27..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/score_oracle.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -oracle_dir=exp/tri5a/decode_callhome_test/oracle -split=callhome_test -data_dir=data/callhome_test -lang_dir=data/lang - -# Make sure that your STM and CTM files are in UTF-8 encoding -# Any other encoding will cause this script to fail/misbehave - -if [ ! -e $oracle_dir -o ! -e $data_dir -o ! -e $lang_dir ]; then - echo "Missing pre-requisites" - exit 1 -fi - -for i in {5..20}; do - mkdir -p $oracle_dir/score_$i - cp $oracle_dir/$split.ctm $oracle_dir/score_$i/ -done - -. 
/export/babel/data/software/env.sh - -# Start scoring -/export/a11/guoguo/babel/103-bengali-limitedLP.official/local/score_stm.sh $data_dir $lang_dir \ - $oracle_dir - -# Print a summary of the result -grep "Percent Total Error" $oracle_dir/score_*/$split.ctm.dtl diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev deleted file mode 100644 index 77e3b01786f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/dev +++ /dev/null @@ -1,20 +0,0 @@ -sp_0897.sph -sp_0968.sph -sp_0981.sph -sp_1062.sph -sp_1292.sph -sp_1411.sph -sp_1413.sph -sp_1552.sph -sp_1554.sph -sp_1805.sph -sp_1808.sph -sp_1882.sph -sp_1930.sph -sp_1947.sph -sp_2037.sph -sp_2054.sph -sp_2057.sph -sp_2107.sph -sp_2109.sph -sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev deleted file mode 100644 index 77e3b01786f..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/dev +++ /dev/null @@ -1,20 +0,0 @@ -sp_0897.sph -sp_0968.sph -sp_0981.sph -sp_1062.sph -sp_1292.sph -sp_1411.sph -sp_1413.sph -sp_1552.sph -sp_1554.sph -sp_1805.sph -sp_1808.sph -sp_1882.sph -sp_1930.sph -sp_1947.sph -sp_2037.sph -sp_2054.sph -sp_2057.sph -sp_2107.sph -sp_2109.sph -sp_2144.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test deleted file mode 100644 index 0cbc3cc95fd..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/test +++ /dev/null @@ -1,20 +0,0 @@ -sp_0053.sph -sp_0082.sph -sp_0084.sph -sp_0088.sph -sp_0681.sph -sp_0699.sph -sp_0776.sph -sp_0857.sph -sp_1031.sph -sp_1100.sph -sp_1148.sph -sp_1156.sph -sp_1186.sph -sp_1212.sph -sp_1345.sph -sp_1435.sph -sp_1578.sph -sp_1648.sph -sp_1807.sph -sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train deleted file mode 100644 index 2c936072534..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_callhome/train +++ /dev/null @@ -1,80 +0,0 @@ -sp_0085.sph -sp_0096.sph -sp_0098.sph -sp_0100.sph -sp_0291.sph -sp_0713.sph -sp_0724.sph -sp_0726.sph -sp_0731.sph -sp_0733.sph -sp_0753.sph -sp_0788.sph -sp_0826.sph -sp_0831.sph -sp_0836.sph -sp_0841.sph -sp_0850.sph -sp_0855.sph -sp_0892.sph -sp_0899.sph -sp_0910.sph -sp_0917.sph -sp_0919.sph -sp_0923.sph -sp_0945.sph -sp_0950.sph -sp_0951.sph -sp_0992.sph -sp_0997.sph -sp_1013.sph -sp_1039.sph -sp_1044.sph -sp_1045.sph -sp_1058.sph -sp_1060.sph -sp_1063.sph -sp_1081.sph -sp_1106.sph -sp_1122.sph -sp_1140.sph -sp_1175.sph -sp_1195.sph -sp_1198.sph -sp_1231.sph -sp_1234.sph -sp_1255.sph -sp_1260.sph -sp_1261.sph -sp_1262.sph -sp_1264.sph -sp_1266.sph -sp_1273.sph -sp_1275.sph -sp_1284.sph -sp_1286.sph -sp_1304.sph -sp_1308.sph -sp_1333.sph -sp_1341.sph -sp_1353.sph -sp_1368.sph -sp_1379.sph -sp_1384.sph -sp_1449.sph -sp_1463.sph -sp_1574.sph -sp_1740.sph -sp_1759.sph -sp_1849.sph -sp_1908.sph -sp_1915.sph -sp_1918.sph -sp_1974.sph -sp_1976.sph -sp_1988.sph -sp_2000.sph -sp_2056.sph -sp_2070.sph -sp_2091.sph -sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev deleted file mode 100644 index d3769f0ffb5..00000000000 
--- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev +++ /dev/null @@ -1,20 +0,0 @@ -20051009_182032_217_fsp.sph -20051009_210519_219_fsp.sph -20051010_212418_225_fsp.sph -20051016_180547_265_fsp.sph -20051016_210626_267_fsp.sph -20051017_180712_270_fsp.sph -20051017_220530_275_fsp.sph -20051017_234550_276_fsp.sph -20051018_210220_279_fsp.sph -20051018_210744_280_fsp.sph -20051019_190221_288_fsp.sph -20051019_210146_289_fsp.sph -20051019_230329_292_fsp.sph -20051022_180817_311_fsp.sph -20051023_232057_325_fsp.sph -20051024_180453_327_fsp.sph -20051024_181110_329_fsp.sph -20051025_212334_337_fsp.sph -20051026_180724_341_fsp.sph -20051026_211309_346_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 deleted file mode 100644 index f1b5c293d67..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/dev2 +++ /dev/null @@ -1,20 +0,0 @@ -20050909_210655_26_fsp.sph -20050910_210708_33_fsp.sph -20050913_210933_49_fsp.sph -20050913_211649_50_fsp.sph -20050915_210434_65_fsp.sph -20050916_180332_68_fsp.sph -20050918_180733_81_fsp.sph -20050918_210841_82_fsp.sph -20050920_212030_93_fsp.sph -20050921_210443_99_fsp.sph -20050923_211304_115_fsp.sph -20050925_180713_120_fsp.sph -20050925_180825_121_fsp.sph -20050926_180516_125_fsp.sph -20050926_180555_126_fsp.sph -20050928_000254_141_fsp.sph -20050930_210540_161_fsp.sph -20051002_180726_170_fsp.sph -20051007_181850_205_fsp.sph -20051007_191217_206_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test deleted file mode 100644 index 6190ced077c..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/test +++ /dev/null @@ -1,20 +0,0 @@ -20051028_180633_356_fsp.sph -20051029_211606_365_fsp.sph -20051030_193924_371_fsp.sph -20051101_212731_386_fsp.sph -20051102_134901_389_fsp.sph -20051102_180402_391_fsp.sph -20051102_181501_393_fsp.sph -20051103_211105_404_fsp.sph -20051103_233456_406_fsp.sph -20051107_184634_438_fsp.sph -20051109_180253_445_fsp.sph -20051109_210353_450_fsp.sph -20051111_181045_470_fsp.sph -20051111_182216_472_fsp.sph -20051112_181649_485_fsp.sph -20051113_155059_492_fsp.sph -20051113_210221_496_fsp.sph -20051113_214925_498_fsp.sph -20051114_181749_505_fsp.sph -20051115_212123_516_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train deleted file mode 100644 index b57683842b2..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/split_fisher/train +++ /dev/null @@ -1,759 +0,0 @@ -20050908_182943_22_fsp.sph -20050908_191808_23_fsp.sph -20050909_210428_25_fsp.sph -20050909_221657_28_fsp.sph -20050910_180310_29_fsp.sph -20050910_180330_30_fsp.sph -20050910_181354_31_fsp.sph -20050910_190223_32_fsp.sph -20050911_180647_34_fsp.sph -20050911_200216_35_fsp.sph -20050911_210429_36_fsp.sph -20050911_210530_37_fsp.sph -20050911_210904_38_fsp.sph -20050912_181441_40_fsp.sph -20050912_181538_41_fsp.sph -20050912_182044_42_fsp.sph -20050912_212913_43_fsp.sph -20050913_180324_44_fsp.sph -20050913_180731_46_fsp.sph -20050913_180947_47_fsp.sph -20050913_210409_48_fsp.sph -20050914_000831_51_fsp.sph -20050914_180332_52_fsp.sph -20050914_180606_53_fsp.sph -20050914_181020_54_fsp.sph -20050914_210243_55_fsp.sph 
-20050914_210822_56_fsp.sph -20050914_220753_58_fsp.sph -20050915_180728_60_fsp.sph -20050915_180740_61_fsp.sph -20050915_192457_62_fsp.sph -20050915_194045_63_fsp.sph -20050915_210200_64_fsp.sph -20050915_210916_66_fsp.sph -20050915_212325_67_fsp.sph -20050916_180740_69_fsp.sph -20050916_200334_70_fsp.sph -20050916_210235_71_fsp.sph -20050916_210510_72_fsp.sph -20050916_223656_73_fsp.sph -20050917_210406_74_fsp.sph -20050917_210805_75_fsp.sph -20050917_211045_76_fsp.sph -20050917_212041_77_fsp.sph -20050918_180326_80_fsp.sph -20050919_000612_83_fsp.sph -20050919_180511_84_fsp.sph -20050919_180703_85_fsp.sph -20050919_180925_86_fsp.sph -20050919_190254_87_fsp.sph -20050920_180330_88_fsp.sph -20050920_180342_89_fsp.sph -20050920_180607_90_fsp.sph -20050920_181919_91_fsp.sph -20050920_211414_92_fsp.sph -20050920_230520_94_fsp.sph -20050921_180639_95_fsp.sph -20050921_181002_96_fsp.sph -20050921_210340_98_fsp.sph -20050921_211329_101_fsp.sph -20050921_221625_102_fsp.sph -20050922_180618_103_fsp.sph -20050922_180948_104_fsp.sph -20050922_210740_106_fsp.sph -20050922_211003_107_fsp.sph -20050922_230412_108_fsp.sph -20050923_180514_110_fsp.sph -20050923_180530_111_fsp.sph -20050923_210442_114_fsp.sph -20050924_180747_117_fsp.sph -20050924_181124_118_fsp.sph -20050925_210645_122_fsp.sph -20050925_231407_123_fsp.sph -20050926_000425_124_fsp.sph -20050926_180719_127_fsp.sph -20050926_220244_130_fsp.sph -20050926_230706_131_fsp.sph -20050927_180422_132_fsp.sph -20050927_181033_133_fsp.sph -20050927_181232_134_fsp.sph -20050927_210320_135_fsp.sph -20050927_210848_136_fsp.sph -20050927_210947_138_fsp.sph -20050927_211929_139_fsp.sph -20050927_231016_140_fsp.sph -20050928_180631_142_fsp.sph -20050928_210256_144_fsp.sph -20050928_210700_145_fsp.sph -20050928_211113_146_fsp.sph -20050928_220320_147_fsp.sph -20050928_232236_148_fsp.sph -20050929_180318_149_fsp.sph -20050929_180722_150_fsp.sph -20050929_180932_151_fsp.sph -20050929_211337_153_fsp.sph -20050929_220820_154_fsp.sph -20050929_230406_155_fsp.sph -20050930_180329_156_fsp.sph -20050930_180411_157_fsp.sph -20050930_180646_158_fsp.sph -20050930_200308_159_fsp.sph -20051001_180328_163_fsp.sph -20051001_181004_164_fsp.sph -20051001_210749_166_fsp.sph -20051001_211346_167_fsp.sph -20051002_180339_169_fsp.sph -20051002_210324_171_fsp.sph -20051002_220651_174_fsp.sph -20051003_180434_175_fsp.sph -20051003_211042_178_fsp.sph -20051003_220633_179_fsp.sph -20051004_180351_180_fsp.sph -20051004_180542_181_fsp.sph -20051004_180730_182_fsp.sph -20051004_200737_183_fsp.sph -20051004_211611_185_fsp.sph -20051005_180420_187_fsp.sph -20051005_180709_188_fsp.sph -20051005_213606_191_fsp.sph -20051005_220917_192_fsp.sph -20051005_230659_193_fsp.sph -20051006_180416_194_fsp.sph -20051006_180653_195_fsp.sph -20051006_180815_196_fsp.sph -20051006_181525_197_fsp.sph -20051006_183153_199_fsp.sph -20051006_210246_200_fsp.sph -20051006_210417_201_fsp.sph -20051006_220329_203_fsp.sph -20051008_000036_208_fsp.sph -20051008_180249_209_fsp.sph -20051008_181720_210_fsp.sph -20051008_183224_211_fsp.sph -20051008_190256_212_fsp.sph -20051008_211712_214_fsp.sph -20051008_213416_215_fsp.sph -20051009_180444_216_fsp.sph -20051009_190753_218_fsp.sph -20051009_220443_221_fsp.sph -20051010_180650_222_fsp.sph -20051010_182706_223_fsp.sph -20051010_210622_224_fsp.sph -20051010_222853_227_fsp.sph -20051010_231630_228_fsp.sph -20051011_181919_230_fsp.sph -20051011_211026_232_fsp.sph -20051011_220348_233_fsp.sph -20051012_180233_234_fsp.sph -20051012_190241_236_fsp.sph 
-20051012_193952_237_fsp.sph -20051012_224157_239_fsp.sph -20051013_180458_240_fsp.sph -20051013_180613_241_fsp.sph -20051013_180700_242_fsp.sph -20051013_182213_244_fsp.sph -20051013_210221_245_fsp.sph -20051013_210425_246_fsp.sph -20051013_210941_247_fsp.sph -20051013_220243_248_fsp.sph -20051014_180259_249_fsp.sph -20051014_180940_250_fsp.sph -20051014_180948_251_fsp.sph -20051014_183707_252_fsp.sph -20051014_210348_253_fsp.sph -20051014_210647_254_fsp.sph -20051014_220227_256_fsp.sph -20051014_230339_257_fsp.sph -20051015_180549_258_fsp.sph -20051015_190247_259_fsp.sph -20051015_210138_260_fsp.sph -20051015_210701_261_fsp.sph -20051015_210831_262_fsp.sph -20051016_180926_266_fsp.sph -20051017_000346_269_fsp.sph -20051017_210137_273_fsp.sph -20051017_215732_274_fsp.sph -20051018_180559_277_fsp.sph -20051018_180816_278_fsp.sph -20051018_211701_282_fsp.sph -20051018_231046_283_fsp.sph -20051018_235317_284_fsp.sph -20051019_180448_285_fsp.sph -20051019_183344_287_fsp.sph -20051020_180339_293_fsp.sph -20051020_180759_295_fsp.sph -20051020_210218_297_fsp.sph -20051020_212525_299_fsp.sph -20051020_222944_300_fsp.sph -20051020_234953_301_fsp.sph -20051021_180218_302_fsp.sph -20051021_180508_303_fsp.sph -20051021_190605_304_fsp.sph -20051021_210159_305_fsp.sph -20051021_210530_306_fsp.sph -20051021_222225_307_fsp.sph -20051022_001311_309_fsp.sph -20051022_180452_310_fsp.sph -20051022_180829_312_fsp.sph -20051022_190406_313_fsp.sph -20051022_200517_314_fsp.sph -20051022_210920_315_fsp.sph -20051022_230324_316_fsp.sph -20051022_232428_317_fsp.sph -20051023_180342_318_fsp.sph -20051023_180530_319_fsp.sph -20051023_190301_321_fsp.sph -20051023_210258_322_fsp.sph -20051023_210605_323_fsp.sph -20051023_223751_324_fsp.sph -20051024_000348_326_fsp.sph -20051024_180624_328_fsp.sph -20051024_210748_330_fsp.sph -20051024_211346_331_fsp.sph -20051024_221753_332_fsp.sph -20051024_230857_333_fsp.sph -20051025_180351_334_fsp.sph -20051025_210532_335_fsp.sph -20051025_210959_336_fsp.sph -20051025_220419_338_fsp.sph -20051026_180611_340_fsp.sph -20051026_190359_343_fsp.sph -20051026_210334_344_fsp.sph -20051026_211202_345_fsp.sph -20051026_230956_347_fsp.sph -20051026_234001_348_fsp.sph -20051027_180217_349_fsp.sph -20051027_210159_351_fsp.sph -20051027_210333_352_fsp.sph -20051027_211525_353_fsp.sph -20051027_231329_354_fsp.sph -20051028_180329_355_fsp.sph -20051028_210350_358_fsp.sph -20051028_211904_359_fsp.sph -20051029_200218_363_fsp.sph -20051029_210442_364_fsp.sph -20051029_220538_366_fsp.sph -20051030_000333_367_fsp.sph -20051030_180521_368_fsp.sph -20051030_181001_369_fsp.sph -20051030_190231_370_fsp.sph -20051030_210903_372_fsp.sph -20051030_230444_373_fsp.sph -20051031_180213_374_fsp.sph -20051031_180906_375_fsp.sph -20051031_210229_377_fsp.sph -20051031_220447_379_fsp.sph -20051101_153940_380_fsp.sph -20051101_211314_384_fsp.sph -20051101_223911_387_fsp.sph -20051101_230216_388_fsp.sph -20051102_175957_390_fsp.sph -20051102_210243_394_fsp.sph -20051102_210828_395_fsp.sph -20051102_211130_396_fsp.sph -20051103_163507_398_fsp.sph -20051103_180920_400_fsp.sph -20051103_185102_401_fsp.sph -20051103_210539_403_fsp.sph -20051103_223906_405_fsp.sph -20051104_123901_407_fsp.sph -20051104_180145_408_fsp.sph -20051104_181437_409_fsp.sph -20051104_190247_410_fsp.sph -20051104_210307_411_fsp.sph -20051104_210814_412_fsp.sph -20051104_212121_413_fsp.sph -20051104_222117_414_fsp.sph -20051104_231424_416_fsp.sph -20051105_175657_418_fsp.sph -20051105_181203_419_fsp.sph -20051105_210724_421_fsp.sph 
-20051105_220745_422_fsp.sph -20051106_180232_424_fsp.sph -20051106_181321_425_fsp.sph -20051106_190219_426_fsp.sph -20051106_200213_427_fsp.sph -20051106_210215_428_fsp.sph -20051106_210310_429_fsp.sph -20051106_211252_430_fsp.sph -20051106_211804_431_fsp.sph -20051106_215339_432_fsp.sph -20051106_221653_433_fsp.sph -20051107_115855_434_fsp.sph -20051107_160351_435_fsp.sph -20051107_180332_436_fsp.sph -20051107_182401_437_fsp.sph -20051107_210309_439_fsp.sph -20051107_212723_440_fsp.sph -20051108_145902_441_fsp.sph -20051108_181424_442_fsp.sph -20051108_210224_443_fsp.sph -20051108_212018_444_fsp.sph -20051109_180413_446_fsp.sph -20051109_181432_447_fsp.sph -20051109_181906_448_fsp.sph -20051109_183631_449_fsp.sph -20051109_210436_451_fsp.sph -20051109_211151_452_fsp.sph -20051109_212148_453_fsp.sph -20051109_232505_454_fsp.sph -20051110_155523_455_fsp.sph -20051110_180208_456_fsp.sph -20051110_180838_457_fsp.sph -20051110_182221_459_fsp.sph -20051110_182318_460_fsp.sph -20051110_210200_461_fsp.sph -20051110_210233_462_fsp.sph -20051110_210454_463_fsp.sph -20051110_211110_464_fsp.sph -20051110_212818_466_fsp.sph -20051110_225245_467_fsp.sph -20051111_181441_471_fsp.sph -20051111_184451_474_fsp.sph -20051111_190326_475_fsp.sph -20051111_194004_477_fsp.sph -20051111_201357_478_fsp.sph -20051111_230329_480_fsp.sph -20051112_000305_482_fsp.sph -20051112_165916_483_fsp.sph -20051112_185651_487_fsp.sph -20051112_190443_488_fsp.sph -20051112_210205_489_fsp.sph -20051112_210631_490_fsp.sph -20051112_231502_491_fsp.sph -20051113_180809_493_fsp.sph -20051113_210908_497_fsp.sph -20051113_220433_499_fsp.sph -20051114_171942_502_fsp.sph -20051114_181118_504_fsp.sph -20051114_210412_506_fsp.sph -20051114_212032_507_fsp.sph -20051114_215057_508_fsp.sph -20051114_220412_509_fsp.sph -20051114_225557_510_fsp.sph -20051115_134012_511_fsp.sph -20051115_180301_512_fsp.sph -20051115_181412_513_fsp.sph -20051115_181731_514_fsp.sph -20051115_182149_515_fsp.sph -20051115_213551_517_fsp.sph -20051115_215935_518_fsp.sph -20051115_230749_520_fsp.sph -20051116_000221_521_fsp.sph -20051116_172353_522_fsp.sph -20051116_180237_524_fsp.sph -20051116_181228_525_fsp.sph -20051116_181816_526_fsp.sph -20051116_190450_527_fsp.sph -20051116_210146_528_fsp.sph -20051116_210553_529_fsp.sph -20051116_211222_530_fsp.sph -20051116_212312_531_fsp.sph -20051116_222454_532_fsp.sph -20051116_233038_533_fsp.sph -20051117_001013_534_fsp.sph -20051117_180234_535_fsp.sph -20051117_181844_537_fsp.sph -20051117_210156_538_fsp.sph -20051117_210403_539_fsp.sph -20051117_211540_540_fsp.sph -20051117_211833_541_fsp.sph -20051117_212855_542_fsp.sph -20051117_213407_543_fsp.sph -20051117_220412_544_fsp.sph -20051117_225943_545_fsp.sph -20051118_180619_547_fsp.sph -20051118_180739_548_fsp.sph -20051118_182114_549_fsp.sph -20051118_182652_550_fsp.sph -20051118_210212_551_fsp.sph -20051118_210455_552_fsp.sph -20051118_212058_553_fsp.sph -20051118_212829_554_fsp.sph -20051119_000355_555_fsp.sph -20051119_181105_556_fsp.sph -20051119_210802_557_fsp.sph -20051119_212315_559_fsp.sph -20051119_214926_560_fsp.sph -20051120_181008_561_fsp.sph -20051120_181339_562_fsp.sph -20051120_190412_563_fsp.sph -20051120_205645_565_fsp.sph -20051120_210347_566_fsp.sph -20051120_211526_567_fsp.sph -20051121_181138_569_fsp.sph -20051121_181357_570_fsp.sph -20051121_190155_571_fsp.sph -20051121_210922_573_fsp.sph -20051122_181114_574_fsp.sph -20051122_190326_576_fsp.sph -20051122_210253_577_fsp.sph -20051122_210703_578_fsp.sph -20051122_211805_579_fsp.sph 
-20051122_213037_580_fsp.sph -20051122_215430_581_fsp.sph -20051123_180926_582_fsp.sph -20051123_181644_583_fsp.sph -20051123_210214_584_fsp.sph -20051123_211514_585_fsp.sph -20051123_212412_586_fsp.sph -20051123_213259_587_fsp.sph -20051124_181720_588_fsp.sph -20051124_190336_589_fsp.sph -20051124_212221_591_fsp.sph -20051124_220457_592_fsp.sph -20051125_181632_593_fsp.sph -20051125_190327_594_fsp.sph -20051125_212150_595_fsp.sph -20051126_181804_597_fsp.sph -20051126_190347_598_fsp.sph -20051126_210222_599_fsp.sph -20051127_181335_601_fsp.sph -20051127_190405_602_fsp.sph -20051127_210516_603_fsp.sph -20051127_211200_604_fsp.sph -20051127_212516_605_fsp.sph -20051128_215149_608_fsp.sph -20051128_222007_609_fsp.sph -20051129_180204_610_fsp.sph -20051129_181241_612_fsp.sph -20051129_181547_613_fsp.sph -20051129_183449_614_fsp.sph -20051129_190152_615_fsp.sph -20051129_210218_616_fsp.sph -20051129_210342_617_fsp.sph -20051129_212711_618_fsp.sph -20051130_181543_619_fsp.sph -20051130_182626_620_fsp.sph -20051130_210202_622_fsp.sph -20051130_210910_623_fsp.sph -20051130_212724_626_fsp.sph -20051130_220121_627_fsp.sph -20051130_221538_628_fsp.sph -20051201_181034_630_fsp.sph -20051201_181303_631_fsp.sph -20051201_183429_632_fsp.sph -20051201_191426_633_fsp.sph -20051201_193415_634_fsp.sph -20051201_195005_635_fsp.sph -20051201_210713_636_fsp.sph -20051201_212329_637_fsp.sph -20051201_230640_638_fsp.sph -20051202_181119_639_fsp.sph -20051202_181659_640_fsp.sph -20051202_182058_641_fsp.sph -20051202_184713_642_fsp.sph -20051202_190154_643_fsp.sph -20051202_193515_644_fsp.sph -20051202_210252_645_fsp.sph -20051202_211824_646_fsp.sph -20051202_212105_647_fsp.sph -20051203_180701_649_fsp.sph -20051203_182100_650_fsp.sph -20051203_182132_651_fsp.sph -20051203_182418_652_fsp.sph -20051203_183501_653_fsp.sph -20051203_190503_654_fsp.sph -20051203_191125_655_fsp.sph -20051203_210216_656_fsp.sph -20051203_212114_658_fsp.sph -20051203_222533_661_fsp.sph -20051206_180753_662_fsp.sph -20051206_180911_663_fsp.sph -20051206_181649_664_fsp.sph -20051206_183057_665_fsp.sph -20051206_193937_667_fsp.sph -20051206_201757_668_fsp.sph -20051206_203158_669_fsp.sph -20051206_210127_670_fsp.sph -20051206_210744_671_fsp.sph -20051206_211522_672_fsp.sph -20051206_213252_673_fsp.sph -20051206_214122_674_fsp.sph -20051206_231328_675_fsp.sph -20051207_180507_676_fsp.sph -20051207_181020_677_fsp.sph -20051207_190155_678_fsp.sph -20051207_190426_679_fsp.sph -20051207_193103_681_fsp.sph -20051207_211858_683_fsp.sph -20051207_212300_684_fsp.sph -20051207_212831_685_fsp.sph -20051207_214411_686_fsp.sph -20051208_180208_687_fsp.sph -20051208_180810_688_fsp.sph -20051208_182430_689_fsp.sph -20051208_190333_690_fsp.sph -20051208_210609_691_fsp.sph -20051208_211702_692_fsp.sph -20051208_212444_694_fsp.sph -20051208_214100_696_fsp.sph -20051208_220606_697_fsp.sph -20051209_180824_699_fsp.sph -20051209_181542_700_fsp.sph -20051209_181642_701_fsp.sph -20051209_182541_702_fsp.sph -20051209_182858_703_fsp.sph -20051209_210136_704_fsp.sph -20051209_210452_705_fsp.sph -20051209_211542_706_fsp.sph -20051209_212515_707_fsp.sph -20051209_222427_709_fsp.sph -20051209_231702_710_fsp.sph -20051210_180659_711_fsp.sph -20051210_181201_712_fsp.sph -20051210_182013_713_fsp.sph -20051210_182603_714_fsp.sph -20051210_190201_715_fsp.sph -20051210_210535_717_fsp.sph -20051210_210735_718_fsp.sph -20051211_000414_719_fsp.sph -20051211_181346_720_fsp.sph -20051211_182045_721_fsp.sph -20051211_184252_723_fsp.sph -20051211_190523_724_fsp.sph 
-20051211_210240_725_fsp.sph -20051211_211415_726_fsp.sph -20051212_180251_727_fsp.sph -20051212_181817_728_fsp.sph -20051212_182453_729_fsp.sph -20051212_190335_730_fsp.sph -20051212_210527_731_fsp.sph -20051212_210738_732_fsp.sph -20051212_211419_733_fsp.sph -20051212_213447_734_fsp.sph -20051212_214512_735_fsp.sph -20051213_180254_736_fsp.sph -20051213_185913_737_fsp.sph -20051213_191741_738_fsp.sph -20051213_210120_739_fsp.sph -20051213_211552_741_fsp.sph -20051213_211953_742_fsp.sph -20051213_221424_743_fsp.sph -20051213_222016_744_fsp.sph -20051214_193942_746_fsp.sph -20051214_194606_747_fsp.sph -20051214_201000_748_fsp.sph -20051214_202717_749_fsp.sph -20051214_211653_750_fsp.sph -20051214_212318_751_fsp.sph -20051214_212718_752_fsp.sph -20051214_213225_753_fsp.sph -20051215_180855_754_fsp.sph -20051215_181731_755_fsp.sph -20051215_182213_756_fsp.sph -20051215_190143_757_fsp.sph -20051215_190419_758_fsp.sph -20051215_195526_759_fsp.sph -20051215_200925_760_fsp.sph -20051215_201639_761_fsp.sph -20051215_203848_762_fsp.sph -20051215_210410_764_fsp.sph -20051215_212456_766_fsp.sph -20051215_212701_767_fsp.sph -20051215_212749_768_fsp.sph -20051215_214814_769_fsp.sph -20051215_220537_770_fsp.sph -20051215_222306_771_fsp.sph -20051216_181042_773_fsp.sph -20051216_182340_774_fsp.sph -20051216_191101_775_fsp.sph -20051216_192823_776_fsp.sph -20051216_200153_777_fsp.sph -20051216_211423_778_fsp.sph -20051216_220626_779_fsp.sph -20051217_142547_780_fsp.sph -20051217_180231_781_fsp.sph -20051217_182026_783_fsp.sph -20051217_182330_784_fsp.sph -20051217_182530_785_fsp.sph -20051217_183115_786_fsp.sph -20051217_190226_787_fsp.sph -20051218_142845_790_fsp.sph -20051218_180353_791_fsp.sph -20051218_181751_792_fsp.sph -20051218_182127_793_fsp.sph -20051218_182750_794_fsp.sph -20051218_200401_799_fsp.sph -20051218_210249_800_fsp.sph -20051218_211820_801_fsp.sph -20051218_212444_802_fsp.sph -20051218_212813_803_fsp.sph -20051219_180225_804_fsp.sph -20051219_182110_806_fsp.sph -20051219_190625_808_fsp.sph -20051219_210655_812_fsp.sph -20051219_212218_813_fsp.sph -20051219_212716_814_fsp.sph -20051219_213203_815_fsp.sph -20051219_221213_816_fsp.sph -20051219_223123_817_fsp.sph -20051220_181731_820_fsp.sph -20051220_190121_821_fsp.sph -20051220_212400_826_fsp.sph -20051220_212718_828_fsp.sph -20051220_213420_829_fsp.sph -20051221_000417_830_fsp.sph -20051221_180958_831_fsp.sph -20051221_210452_840_fsp.sph -20051221_212325_841_fsp.sph -20051221_212911_842_fsp.sph -20051222_000436_843_fsp.sph -20051222_181242_845_fsp.sph -20051222_181506_846_fsp.sph -20051222_182617_847_fsp.sph -20051222_184209_849_fsp.sph -20051222_200553_850_fsp.sph -20051222_210309_852_fsp.sph -20051222_212425_855_fsp.sph -20051223_180346_856_fsp.sph -20051223_181050_857_fsp.sph -20051223_183105_860_fsp.sph -20051223_212547_863_fsp.sph -20051223_212853_864_fsp.sph -20051224_180302_865_fsp.sph -20051224_182949_867_fsp.sph -20051224_210150_870_fsp.sph -20051224_213010_871_fsp.sph -20051225_192042_872_fsp.sph -20051225_210556_873_fsp.sph -20051226_180908_874_fsp.sph -20051226_181659_875_fsp.sph -20051227_181058_885_fsp.sph -20051227_211308_887_fsp.sph -20051227_213029_888_fsp.sph -20051227_214843_889_fsp.sph -20051227_220309_890_fsp.sph -20051228_180249_891_fsp.sph -20051228_182051_892_fsp.sph -20051228_183955_893_fsp.sph -20051228_210524_896_fsp.sph -20051228_211808_897_fsp.sph -20051228_212304_899_fsp.sph -20051228_212734_900_fsp.sph -20051228_223227_901_fsp.sph -20051229_180231_902_fsp.sph -20051229_182614_906_fsp.sph 
-20051229_182631_907_fsp.sph -20051229_214024_909_fsp.sph -20051230_180457_910_fsp.sph -20051230_181721_912_fsp.sph -20051230_210412_913_fsp.sph -20051230_210559_914_fsp.sph -20051230_212557_915_fsp.sph -20051231_000808_916_fsp.sph -20060103_180314_917_fsp.sph -20060103_182107_918_fsp.sph -20060103_182257_919_fsp.sph -20060103_182549_920_fsp.sph -20060103_182654_921_fsp.sph -20060103_184037_922_fsp.sph -20060103_211504_925_fsp.sph -20060103_211732_926_fsp.sph -20060104_180509_928_fsp.sph -20060104_181040_929_fsp.sph -20060104_182115_930_fsp.sph -20060104_182644_931_fsp.sph -20060104_190448_933_fsp.sph -20060104_192707_934_fsp.sph -20060104_210223_935_fsp.sph -20060104_212844_936_fsp.sph -20060104_220148_937_fsp.sph -20060105_202127_943_fsp.sph -20060105_205957_944_fsp.sph -20060105_210951_945_fsp.sph -20060105_211743_946_fsp.sph -20060105_213129_947_fsp.sph -20060105_213243_948_fsp.sph -20060105_230711_949_fsp.sph -20060106_180202_950_fsp.sph -20060106_181040_951_fsp.sph -20060106_181726_952_fsp.sph -20060106_182909_953_fsp.sph -20060106_183056_954_fsp.sph -20060106_183550_955_fsp.sph -20060106_185224_956_fsp.sph -20060106_193129_957_fsp.sph -20060107_180634_960_fsp.sph -20060107_181553_961_fsp.sph -20060107_182715_962_fsp.sph -20060107_190206_963_fsp.sph -20060107_190415_964_fsp.sph -20060107_210435_966_fsp.sph -20060107_220739_967_fsp.sph -20060108_180630_968_fsp.sph -20060108_194731_971_fsp.sph -20060108_234917_976_fsp.sph -20060109_180448_977_fsp.sph -20060109_182557_979_fsp.sph -20060109_183636_980_fsp.sph -20060109_183727_981_fsp.sph -20060109_205815_982_fsp.sph -20060109_213409_986_fsp.sph -20060109_215138_987_fsp.sph -20060109_220315_988_fsp.sph -20060109_220535_989_fsp.sph -20060110_183405_995_fsp.sph -20060110_200611_998_fsp.sph -20060110_210730_1002_fsp.sph -20060110_213516_1004_fsp.sph -20060110_221920_1006_fsp.sph -20060110_230947_1007_fsp.sph -20060111_181650_1008_fsp.sph -20060111_182557_1009_fsp.sph -20060111_184916_1010_fsp.sph -20060111_192159_1012_fsp.sph -20060111_200345_1013_fsp.sph -20060111_210257_1014_fsp.sph -20060111_212145_1016_fsp.sph -20060111_213742_1017_fsp.sph -20060111_213936_1018_fsp.sph -20060111_230912_1020_fsp.sph -20060112_180639_1021_fsp.sph -20060112_182612_1022_fsp.sph -20060112_183346_1023_fsp.sph -20060112_183622_1024_fsp.sph -20060112_210747_1025_fsp.sph -20060112_211025_1026_fsp.sph -20060112_221010_1027_fsp.sph -20060112_221022_1028_fsp.sph -20060113_180159_1030_fsp.sph -20060113_183452_1033_fsp.sph -20060113_190403_1034_fsp.sph -20060113_213733_1036_fsp.sph -20060114_181137_1039_fsp.sph -20060114_181922_1040_fsp.sph -20060114_191056_1043_fsp.sph -20060114_213242_1044_fsp.sph -20060115_180421_1045_fsp.sph -20060115_183525_1047_fsp.sph -20060115_210217_1048_fsp.sph -20060115_212231_1051_fsp.sph -20060115_220504_1052_fsp.sph -20060115_232345_1053_fsp.sph -20060116_181908_1054_fsp.sph -20060116_182500_1055_fsp.sph -20060116_183201_1056_fsp.sph -20060116_184141_1057_fsp.sph -20060116_202324_1058_fsp.sph -20060116_204753_1059_fsp.sph -20060116_210217_1060_fsp.sph -20060116_211237_1061_fsp.sph -20060116_212845_1063_fsp.sph -20060116_220652_1064_fsp.sph -20060116_221118_1065_fsp.sph -20060117_181936_1068_fsp.sph -20060117_182604_1069_fsp.sph -20060117_185153_1071_fsp.sph -20060117_210138_1072_fsp.sph -20060117_210311_1073_fsp.sph -20060117_212546_1074_fsp.sph -20060118_180229_1076_fsp.sph -20060118_180647_1078_fsp.sph -20060118_182448_1079_fsp.sph -20060118_183010_1080_fsp.sph -20060118_190231_1082_fsp.sph -20060118_200148_1083_fsp.sph 
-20060118_205216_1084_fsp.sph -20060118_212907_1085_fsp.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test deleted file mode 100644 index 0cbc3cc95fd..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/test +++ /dev/null @@ -1,20 +0,0 @@ -sp_0053.sph -sp_0082.sph -sp_0084.sph -sp_0088.sph -sp_0681.sph -sp_0699.sph -sp_0776.sph -sp_0857.sph -sp_1031.sph -sp_1100.sph -sp_1148.sph -sp_1156.sph -sp_1186.sph -sp_1212.sph -sp_1345.sph -sp_1435.sph -sp_1578.sph -sp_1648.sph -sp_1807.sph -sp_1847.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train b/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train deleted file mode 100644 index 2c936072534..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/splits/train +++ /dev/null @@ -1,80 +0,0 @@ -sp_0085.sph -sp_0096.sph -sp_0098.sph -sp_0100.sph -sp_0291.sph -sp_0713.sph -sp_0724.sph -sp_0726.sph -sp_0731.sph -sp_0733.sph -sp_0753.sph -sp_0788.sph -sp_0826.sph -sp_0831.sph -sp_0836.sph -sp_0841.sph -sp_0850.sph -sp_0855.sph -sp_0892.sph -sp_0899.sph -sp_0910.sph -sp_0917.sph -sp_0919.sph -sp_0923.sph -sp_0945.sph -sp_0950.sph -sp_0951.sph -sp_0992.sph -sp_0997.sph -sp_1013.sph -sp_1039.sph -sp_1044.sph -sp_1045.sph -sp_1058.sph -sp_1060.sph -sp_1063.sph -sp_1081.sph -sp_1106.sph -sp_1122.sph -sp_1140.sph -sp_1175.sph -sp_1195.sph -sp_1198.sph -sp_1231.sph -sp_1234.sph -sp_1255.sph -sp_1260.sph -sp_1261.sph -sp_1262.sph -sp_1264.sph -sp_1266.sph -sp_1273.sph -sp_1275.sph -sp_1284.sph -sp_1286.sph -sp_1304.sph -sp_1308.sph -sp_1333.sph -sp_1341.sph -sp_1353.sph -sp_1368.sph -sp_1379.sph -sp_1384.sph -sp_1449.sph -sp_1463.sph -sp_1574.sph -sp_1740.sph -sp_1759.sph -sp_1849.sph -sp_1908.sph -sp_1915.sph -sp_1918.sph -sp_1974.sph -sp_1976.sph -sp_1988.sph -sp_2000.sph -sp_2056.sph -sp_2070.sph -sp_2091.sph -sp_2101.sph diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl b/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl deleted file mode 100755 index 03193384670..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/spron.pl +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env perl - -# Oct 21, 2015 : Gaurav Kumar (Johns Hopkins University) -# GNU General Public License, v3.0 -# -# This script was modified under GPL and is being distributed with -# Kaldi. It requires the preference and rule files -# (under LDC copyright) from LDC96L16. The main changes were -# - Outdated usage of perl conventions updated @_ => $_ or @A -# - This script no longer needs the preference and rule files to -# be in the same directory as this script. -# - Accepts tokens from instead of <> - -# --- Retained previous version information ---------------------------- -# spron.pl Version 0.1 Jan. 11 1995 -# Written by Zhibiao Wu, LDC, wzb@unagi.cis.upenn.edu -# This program needs the basic_rules file to run. The rules must be sorted -# in alphabetical order. The most specific rules should precede the more -# general ones. The conventions used in the basic rules are the same as -# regular expressions used in Perl. - -# Revised history: Feb. 10 1995 - -# The file "preferences" (assumed to be in your current directory) -# gives an "oracle" of correct pronunciations that override the -# machine-generated ones. - -# slightly changed 97/09/05 robertm: -# - look for basic_rules and preferences in $PWD instead of ~wzb/... 
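# A minimal sketch of the rule syntax this script parses (annotation, not part
# of the original header; the example rule is hypothetical, not an entry from
# the LDC96L16 rule file). Each non-comment, non-blank line of the rules file
# has the form
#
#   HEAD -> REPLACEMENT : LEFT_CONTEXT __ RIGHT_CONTEXT
#
# where the contexts are Perl regular expressions and may be empty. For
# instance, a hypothetical rule
#
#   c -> s : __ [ei]
#
# would rewrite "c" as "s" whenever the following character is "e" or "i".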
-# - use next to shortcut loop instead of if/else -# - added a bit of documentation, without really trying to decipher this thing -# ----------------------------------------------------------------------- - -use utf8; -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); - -$vfile = ""; -$preference_file = ""; -$rules_file = ""; -$print_input = 0; -if ($#ARGV < 1) { - # Print Usage - print "Usage : local/spron.pl pref-file rules-file \n"; - exit 1; -} else { - $preference_file = $ARGV[0]; - $rules_file = $ARGV[1]; - if ($#ARGV > 1) { - $vfile = $ARGV[2]; - } - if ($#ARGV > 2) { - $print_input = 1; - } -} - -$rule_num = 0; -$previous = ""; -if ($vfile ne "") { - open(VF, $vfile) || die "Can't find file $vfile!\n"; - while () { - chop; - @A = split(//); - if (($A[0] ne '#') && ($_ ne "")) { - if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { - $head[$rule_num] = $1; - $end[$rule_num] = $2; - $pre[$rule_num] = $3; - if ($4 =~ /#/) { - $nex[$rule_num] = ""; - $some[$rule_num] = $4; - } else { - $nex[$rule_num] = $4; - $some[$rule_num] = $5; - } - if ($previous ne substr($head[$rule_num],0,1)) { - $first{$head[$rule_num]} = $rule_num; - $last{$previous} = $rule_num - 1; - } - $previous = substr($head[$rule_num++],0,1); - } else { - print "Rule format error: Cannot parse $_\n"; - exit(1); - } - } - } - $last{$previous} = $rule_num - 1; - - close(VF); -} - -open(PF, $preference_file) || die "Can't read `preferences' file"; -binmode(PF, ":iso88591"); -while () { - chop; - if ($_ ne "") { - @A = split; - $pron{$A[0]} = $A[1]; - $stre{$A[0]} = $A[2]; - } -} - -$previous = ""; -$brule_num = 0; -open(BF, $rules_file) || die "Can't read `basic_rules' file"; -binmode(BF, ":iso88591"); -while () { - chop; - @A = split(//); - if (($A[0] ne '#') && ($_ ne "")) { - if (/(\S+)\s*->\s*(\S*)\s*:\s*(\S*)\s*__\s*(\S*)\s*(#?)/) { - $bhead[$brule_num] = $1; - $bend[$brule_num] = $2; - $bpre[$brule_num] = $3; - if ($4 =~ /#/) { - $bnex[$brule_num] = ""; - $bsome[$brule_num] = $4; - } else { - $bnex[$brule_num] = $4; - $bsome[$brule_num] = $5; - } - if ($previous ne substr($bhead[$brule_num],0,1)) { - $bfirst{substr($bhead[$brule_num],0,1)} = $brule_num; - $blast{$previous} = $brule_num - 1; - } - $previous = substr($bhead[$brule_num++],0,1); - } else { - print "Rule format error in file basic_rules: Cannot parse $_\n"; - exit(1); - } - } -} -$blast{$previous} = $brule_num - 1; -close(BF); - -if ($brule_num == 0) { - print "No basic rules, Program exit!\n"; - exit(1); -} - -while(){ - next if ((/^#/) || (/^\s*$/) ); - chop; - if ($print_input) { - print $_, "\t"; - } - if ($pron{$_}) { - # print answer from preferences and skip to next word - print "$pron{$_}\t$stre{$_}\n"; - next; - } - $original = $_; - tr/A-ZÁÉÍÓÚÏÜÑ/a-záéíóúïüñ/; - $orig = "#" . $_ . 
"#"; - - @l = (); - - push(@l,split("",$orig)); - - @pron = &transfer(1); - - foreach (@pron) { - $a = $_; - y/aeiouáéíóú//cd; - if ($_ eq "") { - print "#No stressable vowel in $original\n"; - } else { - s/[aeiou]/0/go; - s/[áéíóú]/1/go; - if (!/1/) { - if(length() == 1){ - s/\b./1/o; - } elsif($l[$#l - 1] =~ /[aeiouns]/o){ - s/00\b/10/o; - } else { - s/0\b/1/o; - } - } - - $a =~ s/á/a/g; - $a =~ s/é/e/g; - $a =~ s/í/i/g; - $a =~ s/ó/o/g; - $a =~ s/ú/u/g; - - print "$a\t$_\n"; - } - } -} - -sub transfer{ - local($_) = @_; - local(@p) = (); - local($s) = 0; - local($over) = 0; - local($i,$j,$k) = (0,0,0); - - if ($_ >= length($orig) - 1) { - push(@p, ""); - return(@p); - } else { - - if ($vfile ne "") { - for ($i= $first{substr($orig, $_, 1)}; - $i <= $last{substr($orig, $_, 1)} ; $i++) { - if (&matchv($_,$i)) { - $s = $_ + length($head[$i]); - foreach $w (&transfer($s)) { - push(@p, $end[$i] . $w); - if ($some[$i] ne "") { - $over = 0; - } else { - $over = 1; - } - } - } - } - } - - if ($over == 0 ) { - $i = $bfirst{substr($orig, $_, 1)}; - while (($i <= $blast{substr($orig, $_, 1)}) && ($over == 0)) { - if (&matchb($_,$i)) { - $over = 1; - $s = $_ + length($bhead[$i]); - foreach $w (&transfer($s)) { - push(@p, $bend[$i] . $w); - } - } - $i++; - } - if ($over == 0) { - $s = $_ + 1; - foreach $w (&transfer($s)) { - push(@p, substr($orig,$_,1) . $w); - } - } - } - - return(@p); - } -} - -sub matchv { - $h = $head[$_[1]]; - $p = $pre[$_[1]]; - $n = $nex[$_[1]]; - - return(&match($_[0],$h,$p,$n)); - -} - -sub matchb { - $h = $bhead[$_[1]]; - $p = $bpre[$_[1]]; - $n = $bnex[$_[1]]; - - return(&match($_[0],$h,$p,$n)); - -} - -sub match { - - if (substr($orig, $_[0], length($_[1])) eq $_[1]) { - return ( &match_n($_[0] + length($_[1]) - 1, $_[3]) && - &match_p($_[0], $_[2])); - } else { - return (0); - } -} - -sub match_p { - local($a) = $_[0]; - local($b) = $_[1]; - local($_); - - if ($b eq "" ) { - return (1); - } else { - $_ = substr($orig, 0, $a) . "!"; - if (/($b)!/) { - return(1); - } else { - return(0); - } - } -} - -sub match_n { - local($a) = $_[0]; - local($b) = $_[1]; - local($_); - - if ($b eq "" ) { - return (1); - } else { - $_ = "!" . substr($orig, $a + 1, length($orig) - $a - 1); - if (/!($b)/) { - return(1); - } else { - return(0); - } - } -} diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh deleted file mode 100755 index 9f5855d56c4..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/subset_data_prep.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -# -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is a subset of the dataset in use. (*.sph files) -# In addition the transcripts are needed as well. -# This script is only called internally and should not be -# used for any other purpose. A similar script for general usage -# is local/fsp_data_prep.sh -# To be run from one directory above this script. - -stage=0 - -export LC_ALL=C - - -if [ $# -lt 4 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories and the name of this partition -, and a list of files that belong to this partition . see ../run.sh for example." - exit 1; -fi - -subset=$3 -dir=`pwd`/data/local/$subset/data -mkdir -p $dir -local=`pwd`/local -utils=`pwd`/utils -tmpdir=`pwd`/data/local/tmp -mkdir -p $tmpdir - -. 
./path.sh || exit 1; # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi -cd $dir - -# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command -# line arguments being absolute pathnames. -rm -r links/ 2>/dev/null -mkdir links/ -mkdir links/speech -mkdir links/transcripts -if [ ! -f $4 ]; then - echo "Please specify a valid parition file. Could not find $4" - exit 1; -fi -cat $4 | sed 's:.*/::g' | \ -xargs -I % find $1/ -name %* | xargs -I % echo cp % links/ - -# Basic spot checks to see if we got the data that we needed -if [ ! -d links/LDC2010S01 -o ! -d links/LDC2010T04 ]; -then - echo "The speech and the data directories need to be named LDC2010S01 and LDC2010T04 respecti -vely" - exit 1; -fi - -if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ]; -then - echo "Disc 1 and 2 directories missing or not properly organised within the speech data dir" - echo "Typical format is LDC2010S01/DISC?/data/speech" - exit 1; -fi - -#Check the transcripts directories as well to see if they exist -if [ ! -d links/LDC2010T04/data/transcripts ]; -then - echo "Transcript directories missing or not properly organised" - echo "Typical format is LDC2010T04/data/transcripts" - exit 1; -fi - -speech_d1=$dir/links/LDC2010S01/DISC1/data/speech -speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts -#Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi - -if [ $stage -le 0 ]; then - #Gather all the speech files together to create a file list - #TODO: Train and test split might be required - ( - find $speech_d1 -iname '*.sph'; - find $speech_d2 -iname '*.sph'; - ) > $tmpdir/train_sph.flist - - #Get all the transcripts in one place - find $transcripts -iname '*.tdf' > $tmpdir/train_transcripts.flist -fi - -if [ $stage -le 1 ]; then - $local/fsp_make_trans.pl $tmpdir - mkdir -p $dir/train_all - mv $tmpdir/reco2file_and_channel $dir/train_all/ -fi - -if [ $stage -le 2 ]; then - sort $tmpdir/text.1 | grep -v '((' | \ - awk '{if (NF > 1){ print; }}' | \ - sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ - sed 's:\([^<]*\)<\/lname>:\1:g' | \ - sed 's:::g' | \ - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed 's:<\s*cough[\/]*>:[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's::[noise]:g' | \ - sed 's:[^<]*<\/background>:[noise]:g' | \ - sed -r 's:<[/]?background[/]?>:[noise]:g' | \ - #One more time to take care of nested stuff - sed 's:[^<]*<\/laugh>:[laughter]:g' | \ - sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \ - #now handle the exceptions, find a cleaner way to do this? - sed 's:::g' | \ - sed 's:::g' | \ - sed 's:foreign>::g' | \ - sed 's:>::g' | \ - #How do you handle numbers? 
- grep -v '()' | \ - #Now go after the non-printable characters - sed -r 's:¿::g' > $tmpdir/text.2 - cp $tmpdir/text.2 $dir/train_all/text - - #Create segments file and utt2spk file - ! cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ - && echo "Error producing utt2spk file" && exit 1; - - cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; - $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' >$dir/train_all/segments - - $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt -fi - -if [ $stage -le 3 ]; then - cat $tmpdir/train_sph.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp - cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ - sort -k1,1 -u > $dir/train_all/wav.scp || exit 1; -fi - -if [ $stage -le 4 ]; then - # Build the speaker to gender map, the temporary file with the speaker in gender information is already created by fsp_make_trans.pl. - cat $tmpdir/spk2gendertmp | sort | uniq > $dir/train_all/spk2gender -fi - -echo "Fisher Spanish Data preparation succeeded." - -exit 1; - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py deleted file mode 100755 index ce83fa8c8aa..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_1_best.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -import os -import sys - -files = [ -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/scoring/13.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/scoring/13.tra')] - -def findTranscription(timeDetail): - - for file1 in files: - file1.seek(0,0) - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - - -wordsFile = open('exp/tri5a/graph/words.txt') -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations 
added to the spanish files? -# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists('exp/tri5a/one-best/train'): - os.makedirs('exp/tri5a/one-best/train') - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - newFile.close() -provFile.close() - - - - - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py deleted file mode 100755 index b9f906b27da..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_get_lattices.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -from __future__ import print_function -import os -import sys -import subprocess - -latticeLocation = {1:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/latjosh-2/lattices-pushed/", -2:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/latjosh-2/lattices-pushed/", -3:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/latjosh-2/lattices-pushed/", -4:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/latjosh-2/lattices-pushed/", -5:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/latjosh-2/lattices-pushed/", -6:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/latjosh-2/lattices-pushed/", -7:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/latjosh-2/lattices-pushed/", -8:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/latjosh-2/lattices-pushed/", -9:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/latjosh-2/lattices-pushed/", -10:"/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/latjosh-2/lattices-pushed/"} - -latticeDict = {} - -for key,location in latticeLocation.items(): - for root, dirs, filenames in os.walk(location): - for f in filenames: - latticeDict[f] = str(key) - -tmpdir = 'data/local/data/tmp/lattmp' -if not os.path.exists(tmpdir): - os.makedirs(tmpdir) -invalidplfdir = 'data/local/data/tmp/invalidplf' -if not os.path.exists(invalidplfdir): - os.makedirs(invalidplfdir) -else: - os.system("rm " + invalidplfdir + "/*") - -def latticeConcatenate(lat1, lat2): - ''' - Concatenates lattices, writes temporary results to tmpdir - ''' - if lat1 == "": - if os.path.exists('rm ' + tmpdir + '/tmp.lat'): - os.system('rm ' + tmpdir + '/tmp.lat') - return lat2 - else: - proc = subprocess.Popen(['fstconcat', lat1, lat2, (tmpdir + '/tmp.lat')]) - proc.wait() - return tmpdir + '/tmp.lat' - - -def findLattice(timeDetail): - ''' - Finds the lattice corresponding to a time segment - ''' - searchKey = timeDetail + '.lat' - if searchKey in latticeDict: - return "/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-" + latticeDict[searchKey] + 
"/latjosh-2/lattices-pushed/" + searchKey - else: - return -1 - - -# Now read list of files in conversations -fileList = [] -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? -# Now get timing information to concatenate the ASR outputs - -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.plf', 'w+') -lineNo = 1 -invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/invalidPLF', 'w+') -blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/blankPLF', 'w+') -rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/removeLines', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - for line in timingFile: - timeInfo = line.split() - - # For utterances that are concatenated in the translation file, - # the corresponding FSTs have to be translated as well - mergedTranslation = "" - for timeDetail in timeInfo: - tmp = findLattice(timeDetail) - if tmp != -1: - # Concatenate lattices - mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - - if mergedTranslation != "": - - # Sanjeev's Recipe : Remove epsilons and topo sort - finalFST = tmpdir + "/final.fst" - os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST) - - # Now convert to PLF - proc = subprocess.Popen('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh /export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt ' + finalFST, stdout=subprocess.PIPE, shell=True) - PLFline = proc.stdout.readline() - finalPLFFile = tmpdir + "/final.plf" - finalPLF = open(finalPLFFile, "w+") - finalPLF.write(PLFline) - finalPLF.close() - - # now check if this is a valid PLF, if not write it's ID in a - # file so it can be checked later - proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) - line = proc.stdout.readline() - print("{} {}".format(line, lineNo)) - if line.strip() != "PLF format appears to be correct.": - os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) - invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - else: - provFile.write(PLFline) - else: - blankPLF.write(timeInfo[0] + "\n") - rmLines.write("{}\n".format(lineNo)) - # Now convert to PLF - lineNo += 1 - -provFile.close() -invalidPLF.close() -blankPLF.close() -rmLines.close() diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh b/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh deleted file mode 100755 index b8b3ca35ef9..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_pocolm.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -stage=-2 -num_words_pocolm=110000 -prune_size=1000000 - -. ./path.sh -. ./cmd.sh -. ./utils/parse_options.sh - -set -euo pipefail - -export POCOLM_ROOT=$(cd $KALDI_ROOT/tools/pocolm/; pwd -P) -export PATH=$PATH:$POCOLM_ROOT/scripts - -textdir=$1 -pocolm_dir=$2 - - -if [ $stage -le -2 ]; then - echo "****" - echo " POCOLM experiment : Running STAGE 1 : 2-gram Pocolm general closed vocabulary model" - echo " Will estimate the metaparams to be used as unigram weights for stage 2 ....." 
- echo "****" - if [ -e "$textdir"/unigram_weights ]; then - rm "$textdir"/unigram_weights - fi - if [ -e "$pocolm_dir" ]; then - rm -r "$pocolm_dir" - fi - - bash local/pocolm_cust.sh --num-word 0 --ngram-order 2 --pocolm-stage 1 --lm-dir "$pocolm_dir"/lm \ - --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - -fi - -if [ $stage -le -1 ];then - echo "********" - echo "POCOLM experiment : RUNNING STAGE 2 : 3gram POCOLM using unigram wts estimates in 1st stage....." - echo "********" - - echo " " > "$pocolm_dir"/lm/work/.unigram_weights.done - python local/get_unigram_weights_vocab.py "$pocolm_dir"/lm/0_2.pocolm/ "$textdir"/unigram_weights - bash local/pocolm_cust.sh --num-word "$num_words_pocolm" --lm-dir "$pocolm_dir"/lm \ - --arpa-dir "$pocolm_dir"/arpa --textdir "$textdir" - prune_lm_dir.py --target-num-ngrams=$prune_size "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm \ - "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" - mkdir -p "$pocolm_dir"/arpa - format_arpa_lm.py "$pocolm_dir"/lm/"$num_words_pocolm"_3.pocolm_pruned_"$prune_size" | \ - gzip -c > "$pocolm_dir"/arpa/"$num_words_pocolm"_3_pruned_"$prune_size".arpa.gz -fi - - -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py b/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py deleted file mode 100755 index 3f6444da294..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/train_process_oracle.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 - -import os -import sys - -files = [ -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-1/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-2/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-3/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-4/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-5/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-6/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-7/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-8/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-9/exp/tri5a/decode_test/oracle/oracle.tra'), -open('/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-10/exp/tri5a/decode_test/oracle/oracle.tra')] - -def findTranscription(timeDetail): - - for file1 in files: - file1.seek(0,0) - for line in file1: - lineComp = line.split() - if lineComp[0] == timeDetail: - return " ".join(lineComp[1:]) - # No result found - return -1 - - -wordsFile = open('exp/tri5a/graph/words.txt') -words = {} - -# Extract word list -for line in wordsFile: - lineComp = line.split() - words[int(lineComp[1])] = lineComp[0].strip() - -# Now read list of files in conversations -fileList = [] -#conversationList = open('/export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev') -conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train') -for line in conversationList: - line = line.strip() - line = line[:-4] - fileList.append(line) - -# IN what order were the conversations added to the spanish files? 
-# TODO: Make sure they match the order in which these english files are being written - -# Now get timing information to concatenate the ASR outputs -if not os.path.exists('exp/tri5a/one-best/train'): - os.makedirs('exp/tri5a/one-best/train') - -#provFile = open('/export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es', 'w+') -provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train.oracle', 'w+') -for item in fileList: - timingFile = open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') - newFile = open('exp/tri5a/one-best/train/' + item + '.es', 'w+') - for line in timingFile: - timeInfo = line.split() - mergedTranslation = "" - for timeDetail in timeInfo: - #Locate this in ASR dev/test, this is going to be very slow - tmp = findTranscription(timeDetail) - if tmp != -1: - mergedTranslation = mergedTranslation + " " + tmp - mergedTranslation = mergedTranslation.strip() - transWords = [words[int(x)] for x in mergedTranslation.split()] - newFile.write(" ".join(transWords) + "\n") - provFile.write(" ".join(transWords) + "\n") - newFile.close() -provFile.close() - - - - - - diff --git a/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter b/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter deleted file mode 100755 index 4fce42945b3..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/local/wer_output_filter +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sed -f -s:\[laughter\]::g -s:\[noise\]::g -s:\[oov\]::g -s:::g diff --git a/egs/fisher_callhome_spanish/s5_gigaword/path.sh b/egs/fisher_callhome_spanish/s5_gigaword/path.sh deleted file mode 100755 index 2993311fd90..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -export KALDI_ROOT=`pwd`/../../../ -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH -[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 -. $KALDI_ROOT/tools/config/common_path.sh -export LD_LIBRARY_PATH=/home/dpovey/libs - -export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk -export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C -export LANG=C - -source ~/anaconda/bin/activate py36 diff --git a/egs/fisher_callhome_spanish/s5_gigaword/rnnlm b/egs/fisher_callhome_spanish/s5_gigaword/rnnlm deleted file mode 120000 index fb754622d5e..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/rnnlm +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/run.sh b/egs/fisher_callhome_spanish/s5_gigaword/run.sh deleted file mode 100755 index 95425c29034..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/run.sh +++ /dev/null @@ -1,310 +0,0 @@ -#!/bin/bash -# -# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 -# Copyright 2014 Gaurav Kumar. Apache 2.0 -# Recipe for Fisher/Callhome-Spanish - -stage=-1 -lmstage=-2 -train_rnnlm=false -start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. - # If you already have the normalised gigword text somewhere, you can bypass the - # time consuming text cleanup (~1 week) by setting this option false. -addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to - # perform the A, A + G, Dev type POCOLM training configuration. 
- # A=fsp train, G=gigword text, -num_words_pocolm=110000 -train_sgmm2=false - -# call the next line with the directory where the Spanish Fisher data is -# (the values below are just an example). -sfisher_speech=/export/corpora/LDC/LDC2010S01 -sfisher_transcripts=/export/c03/svalluri//LDC2010T04 -spanish_lexicon=/export/corpora/LDC/LDC96L16 -split=local/splits/split_fisher - -callhome_speech=/export/corpora/LDC/LDC96S35 -callhome_transcripts=/export/corpora/LDC/LDC96T17 -split_callhome=local/splits/split_callhome - -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=workdir_rnnlm_Spanish_08032019 -mfccdir=`pwd`/mfcc - -. ./cmd.sh -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -set -eou pipefail - -if [ $stage -le -1 ]; then - local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts - - local/callhome_data_prep.sh $callhome_speech $callhome_transcripts - - # The lexicon is created using the LDC spanish lexicon, the words from the - # fisher spanish corpus. Additional (most frequent) words are added from the - # ES gigaword corpus to bring the total to 64k words. The ES frequency sorted - # wordlist is downloaded if it is not available. - local/fsp_prepare_dict.sh $spanish_lexicon - # Let's keep the original dict copy for G2P training - cp -r data/local/dict data/local/dict_orig - ( - steps/dict/train_g2p_seq2seq.sh data/local/dict_orig/lexicon.txt exp/g2p || touch exp/g2p/.error - ) & - - # Added c,j, v to the non silences phones manually - utils/prepare_lang.sh data/local/dict_orig "" data/local/lang_orig data/lang_orig - - utils/fix_data_dir.sh data/local/data/train_all - - steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1; - - utils/fix_data_dir.sh data/local/data/train_all - utils/validate_data_dir.sh data/local/data/train_all - - cp -r data/local/data/train_all data/train_all - - # For the CALLHOME corpus - utils/fix_data_dir.sh data/local/data/callhome_train_all - - steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1; - - utils/fix_data_dir.sh data/local/data/callhome_train_all - utils/validate_data_dir.sh data/local/data/callhome_train_all - - cp -r data/local/data/callhome_train_all data/callhome_train_all - - local/create_splits.sh $split - local/callhome_create_splits.sh $split_callhome - -fi - -if $start_textcleanup; then - echo "WARNING : Starting from cleaning up and normalizing the Gigword text" - echo " This might take few days........... You can opt out this stage " - echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." - - if [ $stage -le 0 ]; then - mkdir -p "$rnnlm_workdir"/gigaword_rawtext - local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 - cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt - local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ - "$rnnlm_workdir"/normalised_gigaword_corpus/ - mkdir -p "$rnnlm_workdir"/text_lm - cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt - cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. 
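    # Annotation (not part of the recipe): per the commands above and just
    # below, "$rnnlm_workdir"/text_lm ends up holding three plain-text files:
    #   train.txt                        - Fisher train transcripts, utterance ids stripped
    #   dev.txt                          - dev2 transcripts, the held-out set for LM tuning
    #   spanish_gigaword_normalised.txt  - normalised Gigaword text (train.txt appended when addtraintext=true)
    # Stage 1 points local/train_pocolm.sh and local/rnnlm.sh at this directory.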
- cp "$rnnlm_workdir"/normalised_gigaword_corpus/text_normalized "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt - if $addtraintext; then - cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt - fi - fi -fi - -if [ $stage -le 1 ]; then - local/train_pocolm.sh --stage $lmstage --num-words-pocolm $num_words_pocolm "$rnnlm_workdir"/text_lm/ "$rnnlm_workdir"/pocolm - local/get_rnnlm_wordlist.py data/lang_orig/words.txt "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm/words.txt \ - "$rnnlm_workdir"/rnnlm_wordlist "$rnnlm_workdir"/oov_pocolmwords - if $train_rnnlm; then - local/rnnlm.sh --stage $lmstage --dir "$rnnlm_workdir"/rnnlm --pocolm-dir "$rnnlm_workdir"/pocolm/lm/"$num_words_pocolm"_3.pocolm \ - --wordslist "$rnnlm_workdir"/rnnlm_wordlist --text-dir "$rnnlm_workdir"/text_lm - fi -fi - - -if [ $stage -le 2 ]; then - wait # wait till G2P training finishes - if [ -f exp/g2p/.error ]; then - rm exp/g2p/.error || true - echo "Fail to train the G2P model." && exit 1; - fi - steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt - cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. - - utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - - # Make sure that you do not use your test and your dev sets to train the LM - # Some form of cross validation is possible where you decode your dev/set based on an - # LM that is trained on everything but that that conversation - # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl - # to get the numbers. Depending on your needs, you might have to change the size of - # the splits within that file. The default paritions are based on the Kaldi + Joshua - # requirements which means that I have very large dev and test sets - local/fsp_train_lms.sh $split - local/fsp_create_test_lang.sh - - # Now compute CMVN stats for the train, dev and test subsets - steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir - steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir - steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir - #steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir - #steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir - - #n=$[`cat data/train_all/segments | wc -l` - 158126] - #utils/subset_data_dir.sh --last data/train_all $n data/train - steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir - - steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir - steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir - steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir - - # Again from Dan's recipe : Reduced monophone training data - # Now-- there are 1.6 million utterances, and we want to start the monophone training - # on relatively short utterances (easier to align), but not only the very shortest - # ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random - # utterances from those. 
- - utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort - utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k - utils/data/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup - utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k - utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k -fi - -if [ $stage -le 3 ]; then - steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang exp/mono0a - - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; - - steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; - - - (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& - - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; - - steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; - - ( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; - )& -fi - -if [ $stage -le 4 ]; then - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; - -# Train tri3a, which is LDA+MLLT, on 100k data. - steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" \ - 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; - ( - utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; - )& -fi - -if [ $stage -le 5 ]; then -# Next we'll use fMLLR and train with SAT (i.e. 
on -# fMLLR features) - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; - - steps/train_sat.sh --cmd "$train_cmd" \ - 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; - - ( - utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev -)& - - - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; - -# Reduce the number of gaussians - steps/train_sat.sh --cmd "$train_cmd" \ - 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; - - ( - utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/test exp/tri5a/decode_test - - # Decode CALLHOME - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train - ) & - - - steps/align_fmllr.sh \ - --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ - data/train data/lang exp/tri5a exp/tri5a_ali -fi - -if $train_sgmm2; then - -steps/train_ubm.sh \ - --cmd "$train_cmd" 750 \ - data/train data/lang exp/tri5a_ali exp/ubm5 - -steps/train_sgmm2.sh \ - --cmd "$train_cmd" 5000 18000 \ - data/train data/lang exp/tri5a_ali exp/ubm5/final.ubm exp/sgmm5 - -utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph - -( - steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ - --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ - exp/sgmm5/graph data/dev exp/sgmm5/decode_dev -)& - -steps/align_sgmm2.sh \ - --nj 32 --cmd "$train_cmd" --transform-dir exp/tri5a_ali \ - --use-graphs true --use-gselect true \ - data/train data/lang exp/sgmm5 exp/sgmm5_ali - -steps/make_denlats_sgmm2.sh \ - --nj 32 --sub-split 32 --num-threads 4 \ - --beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri5a_ali \ - data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats - -steps/train_mmi_sgmm2.sh \ - --cmd "$train_cmd" --drop-frames true --transform-dir exp/tri5a_ali --boost 0.1 \ - data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \ - exp/sgmm5_mmi_b0.1 - -( -utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph -steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ - --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 12"\ - exp/tri5a/graph data/dev exp/tri5a/decode_dev -utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph -steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \ - --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev \ - exp/sgmm5/graph data/dev exp/sgmm5/decode_dev -for iter in 1 2 3 4; do - decode=exp/sgmm5_mmi_b0.1/decode_dev_it$iter - mkdir -p $decode - steps/decode_sgmm2_rescore.sh \ - --cmd "$decode_cmd" --iter $iter --transform-dir 
exp/tri5a/decode_dev \ - data/lang_test data/dev/ exp/sgmm5/decode_dev $decode -done -) & -fi - -wait; - -if [ $stage -le 6 ]; then - local/chain/run_tdnn_1g.sh --stage 0 --gigaword-workdir $rnnlm_workdir || exit 1; -fi -exit 0; diff --git a/egs/fisher_callhome_spanish/s5_gigaword/steps b/egs/fisher_callhome_spanish/s5_gigaword/steps deleted file mode 120000 index 1b186770dd1..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/steps +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5_gigaword/utils b/egs/fisher_callhome_spanish/s5_gigaword/utils deleted file mode 120000 index a3279dc8679..00000000000 --- a/egs/fisher_callhome_spanish/s5_gigaword/utils +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/utils/ \ No newline at end of file From f810119b7a0f93f9aa3b3d2d387cd113248fafa1 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 2 Apr 2019 10:48:01 -0400 Subject: [PATCH 34/49] Small cleanup for scripts format --- egs/fisher_callhome_spanish/s5/cmd.sh | 4 ++-- egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh | 6 +++--- egs/fisher_callhome_spanish/s5/steps | 2 +- egs/fisher_callhome_spanish/s5/utils | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index db97f1fbc6f..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="retry.pl queue.pl --mem 8G" -export decode_cmd="retry.pl queue.pl --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 2f478419a18..9e9e6efe7df 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -30,7 +30,7 @@ reporting_email= gigaword_workdir= # LSTM/chain options -train_stage=-20 +train_stage=-10 xent_regularize=0.1 dropout_schedule='0,0@0.20,0.3@0.50,0' @@ -157,7 +157,7 @@ if [ $stage -le 19 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" @@ -202,7 +202,7 @@ fi if [ $stage -le 20 ]; then - if [[ $(hostname -f) == *.clsp.joujhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi diff --git a/egs/fisher_callhome_spanish/s5/steps b/egs/fisher_callhome_spanish/s5/steps index 1b186770dd1..6e99bf5b5ad 120000 --- a/egs/fisher_callhome_spanish/s5/steps +++ b/egs/fisher_callhome_spanish/s5/steps @@ -1 +1 @@ -../../wsj/s5/steps/ \ No newline at end of file +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/utils b/egs/fisher_callhome_spanish/s5/utils index a3279dc8679..b240885218f 120000 --- a/egs/fisher_callhome_spanish/s5/utils +++ b/egs/fisher_callhome_spanish/s5/utils @@ -1 +1 @@ -../../wsj/s5/utils/ \ No newline at end of file +../../wsj/s5/utils \ No newline at end of file From dc8a56e5bacbfbbee7573f00bbceed78398858c4 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Fri, 5 Apr 2019 06:57:03 -0400 Subject: [PATCH 35/49] Cosmetic fix --- egs/fisher_callhome_spanish/s5/run.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 95425c29034..17ef6313e5e 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -80,17 +80,18 @@ if [ $stage -le -1 ]; then fi -if $start_textcleanup; then - echo "WARNING : Starting from cleaning up and normalizing the Gigword text" - echo " This might take few days........... You can opt out this stage " - echo " by setting start_textcleanup=false, and having text_lm ready inside rnnlm_workdir." - - if [ $stage -le 0 ]; then +if [ $stage -le 0 ]; then + if $start_textcleanup; then + echo "WARNING : Starting from cleaning up and normalizing the Gigword text" + echo " This might take few days........... You can skip out this stage " + echo " by setting start_textcleanup=false, and having normalised_gigaword_corpus/text_normalized ready inside $rnnlm_workdir." + mkdir -p "$rnnlm_workdir"/gigaword_rawtext local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" "$rnnlm_workdir"/flattened_gigaword_corpus 24 cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ \ "$rnnlm_workdir"/normalised_gigaword_corpus/ + fi mkdir -p "$rnnlm_workdir"/text_lm cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt # For RNNLM and POCOLM training we use dev2/text as dev file. 
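Taken together with the second hunk just below, this change only re-nests the guards: the stage-0 block always prepares the LM text directory, while the week-long Gigaword flattening and normalization stays optional behind start_textcleanup. A rough sketch of the resulting stage-0 block (abridged from the two hunks; every path and variable is one already defined at the top of run.sh):

if [ $stage -le 0 ]; then
  if $start_textcleanup; then
    # Optional and slow: flatten the raw Gigaword corpus and normalize it.
    mkdir -p "$rnnlm_workdir"/gigaword_rawtext
    local/flatten_gigaword/flatten_all_gigaword.sh "$gigaword_datapath" \
      "$rnnlm_workdir"/flattened_gigaword_corpus 24
    cat "$rnnlm_workdir"/flattened_gigaword_corpus/*.flat > "$rnnlm_workdir"/gigaword_rawtext/in.txt
    local/clean_txt_dir.sh "$rnnlm_workdir"/gigaword_rawtext/ "$rnnlm_workdir"/normalised_gigaword_corpus/
  fi
  # Always build the LM text directory from the Fisher transcripts.
  mkdir -p "$rnnlm_workdir"/text_lm
  cut -d " " -f 2- data/train/text > "$rnnlm_workdir"/text_lm/train.txt
  cut -d " " -f 2- data/dev2/text > "$rnnlm_workdir"/text_lm/dev.txt  # dev2 serves as the LM dev set
fi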
@@ -98,7 +99,6 @@ if $start_textcleanup; then if $addtraintext; then cat "$rnnlm_workdir"/text_lm/train.txt >> "$rnnlm_workdir"/text_lm/spanish_gigaword_normalised.txt fi - fi fi if [ $stage -le 1 ]; then From 8b8222e58dd8b5814ec29b4550b42cf613389372 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 18 Apr 2019 11:28:58 -0400 Subject: [PATCH 36/49] Remove virtenv dependency --- egs/fisher_callhome_spanish/s5/path.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 2993311fd90..c4b93124d7c 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -9,5 +9,3 @@ export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH export LC_ALL=C export LANG=C - -source ~/anaconda/bin/activate py36 From 0e7afa828153697390293a4469f78c5d0600caca Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 19 Apr 2019 14:13:27 +0530 Subject: [PATCH 37/49] Update path.sh --- egs/fisher_callhome_spanish/s5/path.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index c4b93124d7c..201edd95876 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -3,9 +3,9 @@ export KALDI_ROOT=`pwd`/../../../ export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh -export LD_LIBRARY_PATH=/home/dpovey/libs +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs export SPARROWHAWK_ROOT=$KALDI_ROOT/tools/sparrowhawk export PATH=$SPARROWHAWK_ROOT/bin:$PATH -export LC_ALL=C -export LANG=C +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 From 56d2db9a085cbf55e1345efcf4d10f78fef20c72 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Fri, 19 Apr 2019 14:20:54 +0530 Subject: [PATCH 38/49] Update install_sparrowhawk.sh --- tools/install_sparrowhawk.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh index f9bbcb1b28e..645577d4f3b 100755 --- a/tools/install_sparrowhawk.sh +++ b/tools/install_sparrowhawk.sh @@ -4,6 +4,7 @@ export CXXFLAGS="-I`pwd`/openfst/include" stage=0 if [ $stage -le 0 ] ; then + rm -rf re2 protobuf sparrowhawk* git clone -b feature/Spanish_normalizer https://github.com/spokencloud/sparrowhawk-resources.git || exit 1; patch -p0 < sparrowhawk-resources/local/Makefile.patch || exit 1; make openfst || exit 1; @@ -57,15 +58,14 @@ if [ $stage -le 1 ]; then fi if [ $stage -le 2 ]; then - source ~/anaconda/bin/activate py27 || exit 1; cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; cd sparrowhawk/language-resources/en/textnorm/classifier || exit 1; . 
./path.sh || exit 1; - python create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far + python2 create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far thraxmakedep tokenize_and_classify.grm || exit 1; make || exit 1; cd ../verbalizer - python create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far + python2 create_far.py ascii.syms number_names_depot_ascii number_names_depot number_names_depot.far cp -r ../classifier/universal_depot.far . thraxmakedep verbalize.grm || exit 1; make || exit 1; From fb6693e795d861ff47b656806e62cb93fdb1751d Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sat, 20 Apr 2019 20:26:09 +0530 Subject: [PATCH 39/49] Set lang to ESP --- tools/install_sparrowhawk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/install_sparrowhawk.sh b/tools/install_sparrowhawk.sh index 645577d4f3b..b6a7af211f5 100755 --- a/tools/install_sparrowhawk.sh +++ b/tools/install_sparrowhawk.sh @@ -59,7 +59,7 @@ fi if [ $stage -le 2 ]; then cp -r sparrowhawk-resources/language-resources sparrowhawk/ || exit 1; - cd sparrowhawk/language-resources/en/textnorm/classifier || exit 1; + cd sparrowhawk/language-resources/esp/textnorm/classifier || exit 1; . ./path.sh || exit 1; python2 create_far.py ascii.syms universal_depot_ascii universal_depot universal_depot.far thraxmakedep tokenize_and_classify.grm || exit 1; From ce0f42012583c8506d47d3b4a994a3085411ebc1 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Tue, 23 Apr 2019 13:35:33 +0530 Subject: [PATCH 40/49] Set pocolm option - --limit-unk-history=true --- egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh index 0e71be29119..0a5649c2a79 100755 --- a/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh +++ b/egs/fisher_callhome_spanish/s5/local/pocolm_cust.sh @@ -52,10 +52,10 @@ bypass_metaparam_optim_opt= #for order in 3; do #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done -limit_unk_history_opt= +#limit_unk_history_opt= # If you want to limit the left of in the history of a n-gram # un-comment the following line -#limit_unk_history_opt="--limit-unk-history=true" +limit_unk_history_opt="--limit-unk-history=true" for order in ${ngram_order}; do # decide on the vocabulary. 
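As the surrounding comment in pocolm_cust.sh notes, --limit-unk-history=true makes pocolm cut off the left history of an n-gram at the unknown word, so the many Gigaword OOVs that map to the unknown-word symbol do not leak into longer histories. A hypothetical sketch of how the variable is expected to reach pocolm, modeled on the analogous train_lm.py call in Kaldi's other pocolm wrapper scripts; the option values and ${textdir} below are placeholders, not copied from this recipe:

# Placeholder invocation -- the real call with the recipe's options lives further down in pocolm_cust.sh.
train_lm.py --num-words=${num_word} --num-splits=5 --warm-start-ratio=10 \
            ${limit_unk_history_opt} ${bypass_metaparam_optim_opt} \
            ${textdir} ${order} ${lm_dir}/work ${lm_dir}/${num_word}_${order}.pocolm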
From 9487ce1fa4219273a538e3427d6fb8a0d0005c6b Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 23 Apr 2019 09:39:08 -0400 Subject: [PATCH 41/49] Removed unused code --- egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index 9e9e6efe7df..b6723c8a523 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -255,12 +255,6 @@ if [ $stage -le 21 ]; then fi -# Let's train first a small RNNLM on Fisher train set -rnnlmdir=exp/rnnlm_lstm_tdnn_1b -if [ $stage -le 22 ]; then - rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; -fi - if [ $stage -le 23 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true @@ -283,8 +277,6 @@ if [ $stage -le 23 ]; then bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; fi - bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $rnnlmdir data/${data}_hires/ \ - ${dir}/decode_${lmtype}_${data} $dir/decode_rnnLM_${lmtype}_${data} || exit 1; ) || touch $dir/.error & done wait From 25609c53cb1c6871dfe16595e57f7a8a0ebd7d5b Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Tue, 23 Apr 2019 14:15:51 -0400 Subject: [PATCH 42/49] Fix in checking for empty space lines in lexicon --- egs/fisher_callhome_spanish/s5/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 17ef6313e5e..07eeddac78e 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -119,7 +119,7 @@ if [ $stage -le 2 ]; then echo "Fail to train the G2P model." && exit 1; fi steps/dict/apply_g2p_seq2seq.sh "$rnnlm_workdir"/oov_pocolmwords exp/g2p "$rnnlm_workdir"/oov_g2p.lex - cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^$/d" |sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt + cat "$rnnlm_workdir"/oov_g2p.lex/lexicon.lex data/local/dict/lexicon.txt | sed "/^[[:space:]]*$/d" | sort | uniq > "$rnnlm_workdir"/lexicon_extended.txt cp "$rnnlm_workdir"/lexicon_extended.txt data/local/dict/lexicon.txt # Replacing original lexicon with extended version. 
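# Why the sed change above matters: "/^$/d" deletes only completely empty lines,
# so a whitespace-only line coming out of the G2P lexicon would survive the merge,
# whereas "/^[[:space:]]*$/d" deletes those as well. A quick standalone check:
printf 'casa k a s a\n   \n\n' | sed "/^$/d" | wc -l              # -> 2: the spaces-only line is kept
printf 'casa k a s a\n   \n\n' | sed "/^[[:space:]]*$/d" | wc -l  # -> 1: only the real lexicon entry is kept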
utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang From 510db0f6c72dc6c7b223400a9e203ecdacd3d390 Mon Sep 17 00:00:00 2001 From: saikiranvalluri Date: Thu, 25 Apr 2019 07:39:30 -0400 Subject: [PATCH 43/49] Fix in RNNLM rescoring decode stage --- egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh index b6723c8a523..3e400914521 100755 --- a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -274,6 +274,7 @@ if [ $stage -le 23 ]; then $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data} || exit 1; done if [ $gigaword_workdir ]; then + lmtype=fsp_train bash rnnlm/lmrescore_nbest.sh 1.0 data/lang_test $gigaword_workdir/rnnlm data/${data}_hires/ \ ${dir}/decode_${lmtype}_${data} $dir/decode_gigaword_RNNLM_${lmtype}_${data} || exit 1; fi From 9894f4c7b48d34c0511fb430565436a747b99f9c Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sat, 27 Apr 2019 00:24:03 +0530 Subject: [PATCH 44/49] Update run.sh --- egs/fisher_callhome_spanish/s5/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 07eeddac78e..27a5f2aef82 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -13,7 +13,7 @@ start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to addtraintext=true # If true, this option appends the Fisher train text to the Gigaword corpus textfile, to # perform the A, A + G, Dev type POCOLM training configuration. 
# A=fsp train, G=gigword text, -num_words_pocolm=110000 +num_words_pocolm=100000 train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is From 3bdb541f769c3432c9df6ed6007861275b1a30c8 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Mon, 20 May 2019 19:01:08 +0530 Subject: [PATCH 45/49] Update clean_txt_dir.sh --- egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh index 1880b3a90cb..5d25e3a3fd2 100755 --- a/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh +++ b/egs/fisher_callhome_spanish/s5/local/clean_txt_dir.sh @@ -44,7 +44,7 @@ if [ $stage -le 0 ]; then $train_cmd --max_jobs_run 100 JOB=1:$numsplits $outdir/sparrowhawk/log/JOB.log \ local/run_norm.sh \ sparrowhawk_configuration.ascii_proto \ - $SPARROWHAWK_ROOT/language-resources/en/sparrowhawk/ \ + $SPARROWHAWK_ROOT/language-resources/esp/sparrowhawk/ \ $outdir/data \ JOB \ $outdir/sparrowhawk/ From 6636557c72884771bfb656e302510e6ab4074c91 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sun, 9 Jun 2019 10:53:51 +0530 Subject: [PATCH 46/49] Update run.sh --- egs/fisher_callhome_spanish/s5/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 27a5f2aef82..c1d20134b50 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -27,8 +27,8 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword/data -rnnlm_workdir=workdir_rnnlm_Spanish_08032019 +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data +rnnlm_workdir=workdir_rnnlm_Spanish_gigaword mfccdir=`pwd`/mfcc . ./cmd.sh From 36499a74b2da109302a74b7eb8f7fdc5aa670bda Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sun, 7 Jul 2019 19:44:08 +0530 Subject: [PATCH 47/49] Update run.sh --- egs/fisher_callhome_spanish/s5/run.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index c1d20134b50..70d4d0555a4 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -6,6 +6,9 @@ stage=-1 lmstage=-2 + +# GIGAWORD RNNLM training based options below. +# GIGAWORD RAW CORPUS DATA is assumed to be already downloaded in the gigaword_datapath. train_rnnlm=false start_textcleanup=false # WARNING : IT starts from flattening gigaword corpus to preparing text folder. 
# If you already have the normalised gigword text somewhere, you can bypass the @@ -27,7 +30,7 @@ callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome -gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data +gigaword_datapath=/export/c03/svalluri/Spanish_gigaword_rawcorpus/data # GIGAWORD RAW CORPUS DATA DOWNLOAD PATH rnnlm_workdir=workdir_rnnlm_Spanish_gigaword mfccdir=`pwd`/mfcc From 8da5c3e97053bd46d6855f761d5cb3d7a185106f Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Sat, 13 Jul 2019 11:01:49 +0530 Subject: [PATCH 48/49] Reverse the order of Abbreviation process after punct syms --- egs/fisher_callhome_spanish/s5/local/run_norm.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/run_norm.sh b/egs/fisher_callhome_spanish/s5/local/run_norm.sh index f88fecc815c..a1a171a5ba6 100755 --- a/egs/fisher_callhome_spanish/s5/local/run_norm.sh +++ b/egs/fisher_callhome_spanish/s5/local/run_norm.sh @@ -24,13 +24,16 @@ for i in "${punctuation_symbols[@]}"; do num_syms=$((num_syms+1)) done mkdir -p $dir/normalize/$job -local/clean_abbrevs_text.py $data/$job $data/"$job"_processed -mv $data/"$job"_processed $data/$job + echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh bash $dir/normalize/$job/substitute.sh | \ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text + +local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed +mv $data/"$job"_processed $dir/normalize/$job/text + normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt exit 0; From 510b415d30fdb553ed26fe6907cc37a1118b4d01 Mon Sep 17 00:00:00 2001 From: saikiranvalluri <41471921+saikiranvalluri@users.noreply.github.com> Date: Wed, 21 Aug 2019 13:36:11 +0530 Subject: [PATCH 49/49] Update run_norm.sh --- egs/fisher_callhome_spanish/s5/local/run_norm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/run_norm.sh b/egs/fisher_callhome_spanish/s5/local/run_norm.sh index a1a171a5ba6..839636ea21a 100755 --- a/egs/fisher_callhome_spanish/s5/local/run_norm.sh +++ b/egs/fisher_callhome_spanish/s5/local/run_norm.sh @@ -29,10 +29,10 @@ echo "cat $data/$job | $substitute_arg" > $dir/normalize/$job/substitute.sh bash $dir/normalize/$job/substitute.sh | \ sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \ - sed "s: \s*: :g" | tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' > $dir/normalize/$job/text + sed "s: \s*: :g" > $dir/normalize/$job/text local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed -mv $data/"$job"_processed $dir/normalize/$job/text +tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt
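With these last two patches applied, run_norm.sh ends up substituting punctuation and fixing apostrophes first, handling abbreviations while the text still has its original case, lowercasing afterwards, and only then calling the Sparrowhawk normalizer. A condensed view of that final pipeline, assembled from the added lines above (comments added here only for orientation):

bash $dir/normalize/$job/substitute.sh | \
  sed "s: 's:'s:g" | sed "s: 'm:'m:g" | \
  sed "s: \s*: :g" > $dir/normalize/$job/text                                  # punctuation/apostrophe/spacing cleanup, case preserved
local/clean_abbrevs_text.py $dir/normalize/$job/text $data/"$job"_processed   # abbreviation handling on the still-cased text
tr 'A-ZÂÁÀÄÊÉÈËÏÍÎÖÓÔÖÚÙÛÑÇ' 'a-zâáàäêéèëïíîöóôöúùûñç' < $data/"$job"_processed > $dir/normalize/$job/text   # lowercase last
normalizer_main --config=$config --path_prefix=$path_prefix <$dir/normalize/$job/text >$dir/$job.txt         # Sparrowhawk normalization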