Created
December 8, 2023 09:26
-
-
Save sigismund/f3a9bc07481aca625b1b635a36e1eb29 to your computer and use it in GitHub Desktop.
Get IP ranges of "good" crawl bots. Bash version of https://github.com/AnTheMaker/GoodBots
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Get IP ranges of "good" crawl bots.
# This is a copy of the script used in the GitHub Actions of https://github.com/AnTheMaker/GoodBots
# Requirements: curl, whois, jq
#
# Output (relative to the directory the script is started from):
#   iplists/<name>.ips   one list per source, one address/range per line
#   all.ips              all lists merged; blank lines and duplicates removed
#
# There is deliberately no `set -e`: a failing source does not abort the run,
# its list file is simply left empty (or partial) and the next source is tried.

WORKING_DIRECTORY=$(pwd)
LIST_DIR="$WORKING_DIRECTORY/iplists"
mkdir -p "$LIST_DIR"

# Print the body of URL $1 on stdout.
# -f : on an HTTP error (>= 400) print nothing instead of the error page, so
#      HTML never ends up inside an IP list
# -sS: no progress meter, but still report failures on stderr
fetch() {
  curl -fsS "$1"
}

# Filter: delete every space and tab from stdin.
strip_blanks() {
  tr -d '[:blank:]'
}

# Generate GoogleBot IP-List
fetch 'https://developers.google.com/static/search/apis/ipranges/googlebot.json' \
  | jq -r '.prefixes[].ipv6Prefix,.prefixes[].ipv4Prefix | select( . != null )' \
  | strip_blanks > "$LIST_DIR/googlebot.ips"

# Generate BingBot IP-List
fetch 'https://www.bing.com/toolbox/bingbot.json' \
  | jq -r '.prefixes[].ipv6Prefix,.prefixes[].ipv4Prefix | select( . != null )' \
  | strip_blanks > "$LIST_DIR/bingbot.ips"

# Generate AhrefsBot IP-List # Source: https://help.ahrefs.com/en/articles/78658-what-is-the-list-of-your-ip-ranges
fetch 'https://api.ahrefs.com/v3/public/crawler-ips' \
  | jq -r '.ips[].ip_address | select( . != null )' \
  | strip_blanks > "$LIST_DIR/ahrefsbot.ips"

# Generate FacebookBot IP-List # Source: https://developers.facebook.com/docs/sharing/webmasters/crawler/
# Keep only the route:/route6: lines of the RADB answer and drop the field name;
# the remaining padding is removed by strip_blanks.
whois -h whois.radb.net -- '-i origin AS32934' \
  | grep '^route' \
  | awk '{gsub("(route:|route6:)",""); print}' \
  | strip_blanks > "$LIST_DIR/facebookbot.ips"

# Generate DuckDuckBot IP-List # Source: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/
# The addresses are the markdown bullet lines ("- <ip>"); remove the bullet dash.
fetch 'https://raw.githubusercontent.com/duckduckgo/duckduckgo-help-pages/master/_docs/results/duckduckbot.md' \
  | grep '^- ' \
  | awk '{gsub("-",""); print}' \
  | strip_blanks > "$LIST_DIR/duckduckbot.ips"

# Generate Marginalia IP-List # https://www.marginalia.nu/marginalia-search/for-webmasters/
# Lines look like "IP address: <ip>"; the address is the third field.
fetch 'https://raw.githubusercontent.com/MarginaliaSearch/www.marginalia.nu/main/content/marginalia-search/for-webmasters.md' \
  | grep '^IP address:' \
  | awk '{print $3}' \
  | strip_blanks > "$LIST_DIR/marginalia.ips"

# Get Telegram IP List
fetch 'https://core.telegram.org/resources/cidr.txt' \
  | strip_blanks > "$LIST_DIR/telegrambot.ips"

# Get UptimeRobot IP List # Source: https://uptimerobot.com/help/locations/
fetch 'https://uptimerobot.com/inc/files/ips/IPv4andIPv6.txt' \
  | strip_blanks > "$LIST_DIR/uptimerobot.ips"

# Get Pingdom IP List # Source: https://documentation.solarwinds.com/en/success_center/pingdom/content/topics/pingdom-probe-servers-ip-addresses.htm
fetch 'https://my.pingdom.com/probes/ipv4' \
  | strip_blanks > "$LIST_DIR/pingdombot.ips"
fetch 'https://my.pingdom.com/probes/ipv6' \
  | strip_blanks >> "$LIST_DIR/pingdombot.ips"

# Get Stripe Webhook IP List # Source: https://stripe.com/docs/ips
fetch 'https://stripe.com/files/ips/ips_webhooks.txt' \
  | strip_blanks > "$LIST_DIR/stripewebhook.ips"

# Get RSS API IP List # Source: https://rssapi.net/faq
fetch 'https://rssapi.net/ips.txt' \
  | strip_blanks > "$LIST_DIR/rssapi.ips"

# TEMP: disabled because of firewall
## Get BetterUptime IP List # Source: https://docs.betteruptime.com/frequently-asked-questions
#fetch 'https://betteruptime.com/ips.txt' | strip_blanks > "$LIST_DIR/betteruptimebot.ips"

# Generate WebpagetestBot IP-List # Source: https://www.webpagetest.org/addresses.php
# The URL is quoted because '?' is a glob character.
fetch 'https://www.webpagetest.org/addresses.php?f=json' \
  | jq -r '.data[].addresses[] | select( . != null )' \
  | strip_blanks > "$LIST_DIR/webpagetestbot.ips"

# Get BunnyCDN IP List # Source: https://support.bunny.net/hc/en-us/articles/115001131172-I-am-seeing-a-lot-of-502-and-504-errors
# The first endpoint is used as plain text, the second is parsed as a JSON array.
fetch 'https://api.bunny.net/system/edgeserverlist/plain' \
  | strip_blanks > "$LIST_DIR/bunnycdn.ips"
fetch 'https://api.bunny.net/system/edgeserverlist/ipv6' \
  | jq -r '.[] | select( . != null )' \
  | strip_blanks >> "$LIST_DIR/bunnycdn.ips"

# Get Cloudflare IP List # Source: https://www.cloudflare.com/ips/
# A newline is written between the two downloads so the last IPv4 entry and the
# first IPv6 entry cannot end up on the same line.
fetch 'https://www.cloudflare.com/ips-v4' \
  | strip_blanks > "$LIST_DIR/cloudflare.ips"
printf '\n' >> "$LIST_DIR/cloudflare.ips"
fetch 'https://www.cloudflare.com/ips-v6' \
  | strip_blanks >> "$LIST_DIR/cloudflare.ips"

# Create merged/total IP-List (all.ips)
# grep -h concatenates the files with a proper line break between them and
# drops empty/whitespace-only lines. awk then keeps the FIRST occurrence of
# every line, in input order.
# (The previous `uniq -u` printed only non-repeated lines, i.e. it removed an
# adjacent duplicate pair completely, and it never saw non-adjacent duplicates.)
grep -h -v '^[[:space:]]*$' "$LIST_DIR"/*.ips \
  | strip_blanks \
  | awk '!seen[$0]++' > "$WORKING_DIRECTORY/all.ips"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment