Created
July 26, 2018 18:40
-
-
Save iarroyof/e7f88611ca04a984d899b49e36c201c1 to your computer and use it in GitHub Desktop.
Bash script for creating GloVe word embeddings crashes my hard disk for -vector-size >= 300
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
set -e
# Trains a GloVe model on a corpus and then evaluates it. The rest of this
# script runs vocab_count, cooccur, shuffle and glove from the build directory,
# followed by an evaluation script.
#
# Options (defaults are applied right after parsing):
#   -i  corpus file, fed on stdin to vocab_count and cooccur
#   -o  save-file name handed to glove            (default: glove_ctors)
#   -s  vector size                               (default: 100)
#   -m  minimum word count for the vocabulary     (default: 5)
#   -w  co-occurrence window size                 (default: 15)
#   -t  number of threads for glove               (default: 8)
#
# One optional argument after the options selects the eval script:
# matlab, octave or [default] python.
#
#   bash train_glove.sh -i $DATA/GUsDany/corpus/GUs_literature.txt \
#       -o $DATA/vectors_H100_W8 -s 100 -m 1 -w 8 -t 20

# Parses the command-line flags in "$@" into the globals inputf, outpuf, size,
# minc, win and th. Records in parsed_count how many leading arguments were
# consumed, so the caller can shift them away. Exits with status 2 on an
# unknown option or an option that is missing its argument.
parse_options() {
  local opt
  local OPTIND=1
  while getopts ":i:o:s:m:w:t:" opt; do
    case "$opt" in
      i) inputf="$OPTARG" ;;
      o) outpuf="$OPTARG" ;;
      s) size="$OPTARG" ;;
      m) minc="$OPTARG" ;;
      w) win="$OPTARG" ;;
      # Stored in "th": that is the name the defaults and NUM_THREADS read.
      t) th="$OPTARG" ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        exit 2
        ;;
      \?)
        echo "Invalid option -$OPTARG" >&2
        exit 2
        ;;
    esac
  done
  parsed_count=$((OPTIND - 1))
}

parse_options "$@"
# Drop the parsed flags so that "$1" is the optional eval-language argument.
shift "$parsed_count"
# Fill in a default for every option that was not supplied. The "=" form
# (without a colon) assigns only when the variable is unset, so a value that
# was explicitly given as empty is left untouched.
: "${outpuf=glove_ctors}"   # save-file name
: "${size=100}"             # vector size
: "${minc=5}"               # minimum word count
: "${win=15}"               # window size
: "${th=8}"                 # number of threads
# Pipeline configuration. All file names below are relative paths, so they
# resolve against the directory the script is run from.
CORPUS="$inputf"                              # stdin for vocab_count and cooccur
VOCAB_FILE=vocab.txt                          # written by vocab_count
COOCCURRENCE_FILE=cooccurrence.bin            # written by cooccur
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin  # written by shuffle, read by glove
BUILDDIR=build                                # directory of the GloVe executables
SAVE_FILE="$outpuf"                           # glove -save-file
VERBOSE=2                                     # -verbose for every tool
MEMORY=4.0                                    # -memory for cooccur and shuffle
VOCAB_MIN_COUNT="$minc"                       # vocab_count -min-count
VECTOR_SIZE="$size"                           # glove -vector-size
MAX_ITER=15                                   # glove -iter
WINDOW_SIZE="$win"                            # cooccur -window-size
# glove -binary. The evaluation step reads "$outpuf.txt", so a text output is
# expected; presumably 2 means "text and binary" -- confirm against GloVe docs.
BINARY=2
NUM_THREADS="$th"                             # glove -threads
X_MAX=10                                      # glove -x-max
# Runs one pipeline stage: the command given after the first two arguments is
# executed with stdin read from $1 and stdout written to $2.
# The output is first written to "$2.tmp" and renamed only when the command
# succeeds, so a failed or interrupted stage never leaves a partial file that
# a later run would mistake for a finished one.
# Returns the command's exit status.
run_step() {
  local input=$1 output=$2 status=0
  shift 2
  "$@" < "$input" > "$output.tmp" || status=$?
  if [ "$status" -ne 0 ]; then
    rm -f -- "$output.tmp"
    return "$status"
  fi
  mv -- "$output.tmp" "$output"
}

# Each preprocessing stage is skipped when its output file already exists;
# delete that file to force the stage to run again. The command about to be
# run is echoed first.
if [ ! -f "$VOCAB_FILE" ]; then
  echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
  run_step "$CORPUS" "$VOCAB_FILE" \
    "$BUILDDIR/vocab_count" -min-count "$VOCAB_MIN_COUNT" -verbose "$VERBOSE"
fi
if [ ! -f "$COOCCURRENCE_FILE" ]; then
  echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
  run_step "$CORPUS" "$COOCCURRENCE_FILE" \
    "$BUILDDIR/cooccur" -memory "$MEMORY" -vocab-file "$VOCAB_FILE" \
    -verbose "$VERBOSE" -window-size "$WINDOW_SIZE"
fi
# Note the space before "]": without it the test itself errors out and the
# shuffle stage is never run.
if [ ! -f "$COOCCURRENCE_SHUF_FILE" ]; then
  echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
  run_step "$COOCCURRENCE_FILE" "$COOCCURRENCE_SHUF_FILE" \
    "$BUILDDIR/shuffle" -memory "$MEMORY" -verbose "$VERBOSE"
fi
# Training always runs, reading the shuffled co-occurrence file.
echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
"$BUILDDIR/glove" -save-file "$SAVE_FILE" -threads "$NUM_THREADS" \
  -input-file "$COOCCURRENCE_SHUF_FILE" -x-max "$X_MAX" -iter "$MAX_ITER" \
  -vector-size "$VECTOR_SIZE" -binary "$BINARY" -vocab-file "$VOCAB_FILE" \
  -verbose "$VERBOSE"
# Evaluate the trained vectors. The first positional parameter, as it stands at
# this point in the script, picks the evaluator: "matlab" or "octave" feed the
# matching script under ./eval to that interpreter (its stdout is redirected to
# stderr); any other value, or no argument at all, runs the Python evaluator on
# "$outpuf.txt".
case "${1-}" in
  matlab)
    matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
    ;;
  octave)
    octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
    ;;
  *)
    echo "$ python eval/python/evaluate.py"
    python eval/python/evaluate.py --vectors_file "$outpuf.txt"
    ;;
esac
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment