Created June 20, 2011 05:06
Enumerate all of the CSV download URLs for the files of the Google Books n-gram dataset
#!/bin/bash
# enum-csv-urls.sh
# Copyright 2011 by Max Bane
# Distributed under a Creative Commons Attribution 3.0 Unported License:
# http://creativecommons.org/licenses/by/3.0/

# turn off exclamation point annoyances, a.k.a. bash's dumbest "feature"
histchars=

# defaults
URLBASE="http://commondatastorage.googleapis.com/books/ngrams/books"
CORPUS="eng"
VERSION="20090715"
NGRAMLIST="1,2,3,4,5"

USAGESTR="
$0 [-h] [-b URLBASE] [-c CORPUS] [-v VERSION] [-n NGRAMLIST]

Enumerate all of the CSV download URLs for the files of the Google Books n-gram
corpus (http://ngrams.googlelabs.com/datasets), for a given language and
version, and print each such URL as a line to standard out. This is useful for
piping to xargs in combination with some downloading tool (e.g., wget). For
example, to download the entire French n-gram corpus using three parallel wget
processes:

    $ ./enum-csv-urls.sh -c fre | xargs -P 3 -n 1 wget -nv

Options:
    -h            Print this help message and exit.

    -b URLBASE    Use URLBASE as the base prefix of the CSV files. Default value
                  is \"$URLBASE\".

    -c CORPUS     Use CORPUS as the subcorpus-identifier string, which is
                  embedded in the CSV filenames. Default value is \"$CORPUS\".
                  Valid values as of 06/18/2011 are \"eng\" (all English),
                  \"eng-1M\" (English 1 Million), \"eng-us\" (US English),
                  \"eng-gb\" (British English), \"eng-fiction\" (English
                  fiction), \"chi-sim\" (Simplified Chinese), \"fre\" (French),
                  \"ger\" (German), \"heb\" (Hebrew), \"rus\" (Russian),
                  \"spa\" (Spanish). Check the Google Labs page for updates.

    -v VERSION    Use VERSION as the version identifier string, which is
                  embedded in each CSV filename. Default value is \"$VERSION\".
                  Check the Google Labs page for updates.

    -n NGRAMLIST  Only enumerate CSV filenames belonging to one of the n-gram
                  sets indicated by NGRAMLIST, which is a comma-separated list
                  of integers. Default value \"$NGRAMLIST\" specifies all
                  n-grams, from unigrams through 5-grams.

Copyright 2011 by Max Bane
Distributed under a Creative Commons Attribution 3.0 Unported License:
http://creativecommons.org/licenses/by/3.0/
"

# number of files for each n
NUMFILES[1]=9
NUMFILES[2]=99
NUMFILES[3]=199
NUMFILES[4]=399
NUMFILES[5]=799

# parse options
while getopts ":hb:c:v:n:" opt; do
    case $opt in
        h) echo "$USAGESTR"; exit 0;;
        b) URLBASE=$OPTARG;;
        c) CORPUS=$OPTARG;;
        v) VERSION=$OPTARG;;
        n) NGRAMLIST=$OPTARG;; # FIXME: validate?
        \?) echo "Unknown option: -$OPTARG" >&2; exit 1;;
        :) echo "Option -$OPTARG requires an argument." >&2; exit 1;;
    esac
done

build_url ()
{
    # $1: ngram n
    # $2: file number
    URL="${URLBASE}/googlebooks-${CORPUS}-all-${1}gram-${VERSION}-${2}.csv.zip"
}

build_totalcounts_url ()
{
    URL="${URLBASE}/googlebooks-${CORPUS}-all-totalcounts-${VERSION}.txt"
}

# single text file with the total counts of unigrams
build_totalcounts_url
echo "$URL"

# main loop
for NGRAM in $(echo "$NGRAMLIST" | tr "," "\n"); do
    MAXFILE=${NUMFILES[$NGRAM]}
    if [ -z "$MAXFILE" ]; then
        echo "ERROR: Skipping invalid element of NGRAM list: $NGRAM" >&2
        continue
    fi
    FILENUM=0
    while ((FILENUM <= MAXFILE)); do
        build_url "$NGRAM" "$FILENUM"
        echo "$URL"
        ((FILENUM += 1))
    done
done
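For quick reference, the URL shape the script prints can be sketched on its own. `make_url` below is a hypothetical helper that mirrors `build_url` with the script's default `URLBASE`, `CORPUS`, and `VERSION` values baked in:

```shell
#!/bin/bash
# Sketch of the URL pattern emitted by enum-csv-urls.sh, using the script's
# defaults (eng corpus, 20090715 version). make_url is a hypothetical helper.
URLBASE="http://commondatastorage.googleapis.com/books/ngrams/books"
CORPUS="eng"
VERSION="20090715"

make_url () {
    # $1: n-gram order, $2: file number
    echo "${URLBASE}/googlebooks-${CORPUS}-all-${1}gram-${VERSION}-${2}.csv.zip"
}

make_url 1 0    # first 1-gram file
make_url 5 799  # last 5-gram file
```

With the file counts above (9, 99, 199, 399, 799, each numbered from 0), the script emits 10 + 100 + 200 + 400 + 800 = 1510 CSV URLs plus the single totalcounts URL.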
thanks! your url still works!
I was looking for the French n-gram corpus, and the only other source I found was this site with restricted access:
https://archive.org/download/google_ngrams-french
I used your URL pattern to download the French corpus as follows. There are 10 1-gram files, numbered 0 to 9:
http://commondatastorage.googleapis.com/books/ngrams/books/googlebooks-fre-all-1gram-20090715-{versions}.csv.zip
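(For that specific case, the ten URLs can also be expanded with bash brace expansion, no script needed. A sketch, assuming the same fre/20090715 pattern; `{0..9}` substitutes the file numbers:)

```shell
# Print the ten French 1-gram URLs (file numbers 0 through 9) via brace
# expansion; pipe them to xargs + wget to actually download.
printf '%s\n' "http://commondatastorage.googleapis.com/books/ngrams/books/googlebooks-fre-all-1gram-20090715-"{0..9}".csv.zip"
# e.g.:  printf '%s\n' "...-"{0..9}".csv.zip" | xargs -P 3 -n 1 wget -nv
```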
Thanks, @AmineBarrak, very cool to hear that this still works, haha!
Max: seems to me there's a bug in this (maybe there wasn't when it was written and Google has renamed things...). The "-all-" in lines 90 and 95 shouldn't be there, should it?