Last active
April 23, 2018 15:44
-
-
Save loociano/e43b63e10dd934efde64f301215f389f to your computer and use it in GitHub Desktop.
Convert an HTML paginated table to CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# This script downloads HTML tables and converts them to CSV.
# Luc Rubio <luc@loociano.com>
#
# Fetches pages 1..NUM_PAGES of a paginated HTML table (URL + page number),
# renders the HTML as text with html2text, keeps the lines that look like
# table rows (they start with a number followed by a cell terminator) and
# appends them, comma-separated, below HEADINGS in CSV.
#
# Requires: wget, html2text, sed, grep.
# Exit status: 0 on success, 1 if a tool is missing or a step fails.

set -euo pipefail

# Config
CSV=results.csv
URL='https://www.sportsplits.com/m3/event?c=74&r=370&e=1&pg='
NUM_PAGES=81
HEADINGS='Position,Name,Club,Net Time,Category (POS),Gender (POS)'
# End config. Do not edit below

readonly CSV URL NUM_PAGES HEADINGS
readonly USER_AGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fail early with a clear message instead of producing an empty CSV.
for tool in wget html2text sed grep; do
  command -v "$tool" > /dev/null || die "required tool not found: ${tool}"
done

# Init
# Use an unpredictable temp file and remove it on every exit path.
TMP=$(mktemp) || die "mktemp failed"
readonly TMP
trap 'rm -f -- "$TMP"' EXIT

# Download
# All pages are concatenated into one HTML file. The URL is quoted because it
# contains '?' and '&', which the shell would otherwise treat as a glob
# character and a control operator source of surprises.
for (( page = 1; page <= NUM_PAGES; page++ )); do
  wget --header="Accept: text/html" --user-agent="$USER_AGENT" \
    -O - "${URL}${page}" >> "$TMP" \
    || die "download failed for page ${page}"
done

# Transform
# The CSV is only (re)created once every page has been downloaded, so a failed
# run does not leave a headings-only file behind.
printf '%s\n' "$HEADINGS" > "$CSV"

# Before rendering: put '-' into empty cells, then use ; sign to tag end of
# cells. After rendering: keep only data rows, turn the ';' tags into commas,
# replace '_' with a space and trim whitespace around the commas.
# grep exits 1 when no line matches; that is tolerated (CSV keeps only the
# headings), while a real grep error (status > 1) still aborts the script.
sed -E -e 's/><\/td>/>-<\/td>/g' -e 's/<\/td>/;<\/td>/g' "$TMP" \
  | html2text -width 200 \
  | { grep -E '^[0-9]+[[:space:]];' || [[ $? -eq 1 ]]; } \
  | sed -E \
      -e 's/;/,/g' \
      -e 's/_/ /g' \
      -e 's/,[[:space:]]+/,/g' \
      -e 's/[[:space:]]+,/,/g' \
  >> "$CSV"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment