Created April 6, 2023 14:30
A small bash script that downloads 1.6 TB of extracted structured data from the Common Crawl and finds the pages on which HowTo/FAQ structured data is available.
#!/bin/bash
# Requires GNU parallel and pv: `apt-get install parallel pv`

# Download the list of dump-file URLs
curl http://webdatacommons.org/structureddata/2022-12/files/file.list > urls.txt

# Create the output file
touch output.txt

# Stream each dump file, decompress it on the fly, and keep only statements
# that mention FAQPage or HowTo. Four downloads run in parallel; pv shows
# progress on both the input and output ends of the pipeline.
cat urls.txt | pv -cN Input | parallel -j 4 "curl -s {} | zcat | grep -e '<http://schema.org/FAQPage>' -e '<http://schema.org/HowTo>'" | pv -cN Output > output.txt
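The WebDataCommons dumps are N-Quads, so each matched line should end with the URL of the page the statement was extracted from, followed by a closing dot. Under that assumption, the matches in output.txt can be reduced to a de-duplicated list of pages; this is a hypothetical post-processing sketch, not part of the original gist (the `pages.txt` name and the field position are assumptions):

```shell
#!/bin/bash
# Sketch: extract the page URL (the field just before the trailing ".")
# from each matched N-Quad in output.txt and de-duplicate the result.
[ -f output.txt ] || touch output.txt  # guard so the sketch runs standalone
awk '{print $(NF-1)}' output.txt | sort -u > pages.txt
```

Using awk on whitespace-split fields sidesteps a full N-Quads parser; it is good enough here because the page URL is always the last element before the terminating dot.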