Created
June 22, 2022 02:05
-
-
Save jennynz/08e34c4cbec6d7436bc26f1647b63f5e to your computer and use it in GitHub Desktop.
Filtering out bots from GitHub data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Exact account names of common GitHub bots.
# Names that is_bot already catches through its substring rules
# ('-bot', '[bot]', 'dependabot') do not need to be listed here,
# although listing them anyway is harmless.
GITHUB_BOTS = [
    'netlify',
    'linear-app',
    'codeclimate',
    'renovate',
    'renovate-approve',
    'github-actions',
    'vercel',
    'googlebot',
    'codesandbox-ci',
    'sizebot',
    'tensorflow-jenkins',
    'tensorflowbutler',
    'google-ml-butler',
    'google-cla',
    'coveralls',
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from typing import Optional | |
from bots import GITHUB_BOTS | |
def is_bot(
    row: pd.Series,
    author_col: str,
    bot_col: Optional[str] = None,
    bot_col_val: Optional[str] = None,
) -> bool:
    """Return True if the row appears to have been authored by a bot.

    Two sources of evidence are consulted, in order:

    1. An explicit bot column, when ``bot_col`` is given: the row is a bot
       if ``row[bot_col]`` equals ``bot_col_val``
       (e.g. ``pr_author_typename == 'Bot'``).
    2. The author name: the row is a bot if the name contains ``'-bot'``,
       ``'[bot]'`` or ``'dependabot'``, or is exactly one of the names in
       ``GITHUB_BOTS``. All of these checks are case-sensitive.

    Args:
        row: A single record, indexable by column name.
        author_col: Name of the column holding the author's name.
        bot_col: Optional name of a column that explicitly flags bots.
        bot_col_val: The value of ``bot_col`` that marks a row as a bot.

    Returns:
        True if either source identifies a bot, False otherwise. A row
        whose author is missing (None, NaN or any other non-string) and
        whose bot column does not match is reported as not a bot.

    Raises:
        KeyError: If ``author_col`` (or a provided ``bot_col``) is not
            present in ``row``.
    """
    # Check if the bot column actually says it's a bot
    # e.g. pr_author_typename == 'Bot'
    if bot_col is not None:
        bot_marker = row[bot_col]
        if bot_marker is not None and bot_marker == bot_col_val:
            return True

    # If the bot column doesn't hold any evidence, rely on the author name
    author = row[author_col]
    if not isinstance(author, str):
        # A missing author arrives as None or NaN; substring checks on
        # those would raise TypeError, and there is no name to inspect.
        return False

    return (
        '-bot' in author
        or '[bot]' in author
        or 'dependabot' in author
        or author in GITHUB_BOTS
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from is_bot import is_bot | |
def test_is_bot_checks_author_col_for_mention_of_bot():
    """Author names containing '-bot' or '[bot]' are classified as bots."""
    botlike_authors = (
        'bertie-bot',
        'bertie-bott',
        'bertie [bot]',
        'bertie[bot]',
        'facebook-github-bot',
    )
    for author in botlike_authors:
        row = pd.Series({'pr_author': author})
        assert is_bot(row, author_col='pr_author')
def test_is_bot_catches_all_types_of_dependabot():
    """Every spelling of the dependabot account is classified as a bot."""
    # Covers the plain login, the preview app, and the '[bot]'-suffixed
    # logins used by GitHub Apps, rather than repeating one variant.
    dependabot_variants = (
        'dependabot',
        'dependabot-preview',
        'dependabot[bot]',
        'dependabot-preview[bot]',
    )
    for author in dependabot_variants:
        row = pd.Series({'pr_author': author})
        assert is_bot(row, author_col='pr_author'), author
def test_is_bot_catches_specific_bot_names():
    """Exact bot account names are classified as bots.

    None of these names is required to match the generic '-bot' / '[bot]'
    / 'dependabot' substring rules; each is expected to be recognised as
    a known bot account name in its own right.
    """
    known_bot_names = (
        'netlify',
        'linear-app',
        'codeclimate',
        'renovate',
        'renovate-approve',
        'github-actions',
        'vercel',
        'googlebot',
        'codesandbox-ci',
        'sizebot',
        'tensorflow-jenkins',
        'tensorflowbutler',
        'google-ml-butler',
        'google-cla',
        'coveralls',
    )
    for author in known_bot_names:
        row = pd.Series({'pr_author': author})
        # The message names the offending account when an assertion fails.
        assert is_bot(row, author_col='pr_author'), author
def test_is_bot_does_not_filter_out_incidental_botlike_substrings():
    """Names that merely resemble bot names are not classified as bots.

    'hannah abbott' contains 'bot' but not '-bot' or '[bot]', and
    'i-like-to-renovate' only contains a known bot name as a substring.
    """
    for author in ('hannah abbott', 'i-like-to-renovate'):
        row = pd.Series({'pr_author': author})
        assert not is_bot(row, author_col='pr_author')
def test_is_bot_prioritises_bot_col():
    """The explicit bot column is checked first, then the author name."""
    # A matching bot_col_val overrides the fact that the author name is
    # not bot-like
    assert is_bot(
        pd.Series(
            {'pr_author': 'human being sounding name', 'pr_author_typename': 'Bot'}
        ),
        author_col='pr_author',
        bot_col='pr_author_typename',
        bot_col_val='Bot',
    )
    # The bot column does not match and the author name is not bot-like
    assert not is_bot(
        pd.Series(
            {'pr_author': 'human being sounding name', 'pr_author_typename': 'User'}
        ),
        author_col='pr_author',
        bot_col='pr_author_typename',
        bot_col_val='Bot',
    )
    # The bot column does not match, so the bot-like author name decides
    assert is_bot(
        pd.Series({'pr_author': 'bertie-bot', 'pr_author_typename': 'User'}),
        author_col='pr_author',
        bot_col='pr_author_typename',
        bot_col_val='Bot',
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment